diff --git a/sft_pretrain/Full_smoe/added_tokens.json b/sft_pretrain/Full_smoe/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/added_tokens.json b/sft_pretrain/Full_smoe/checkpoint-3328/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/config.json b/sft_pretrain/Full_smoe/checkpoint-3328/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a5b687752b38ba994afe0584c1b87811b54cb708 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + 
"e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, 
+ 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + 
"transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/generation_config.json b/sft_pretrain/Full_smoe/checkpoint-3328/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d49fa6c35f1e6f899d215d9a6fbf95d920fbfce --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:730df8f293df1a5914025f2d14e1bef4b56befcef1930e56d5b8289f916bbd77 +size 396582032 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..53e72f7ab4d63cae3170af1732e9d595121f6c1f --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:53d629bc715f109fa62b1a38ca99a376b697651785eeba9925463beb9fae4efa +size 396582032 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..644f3aacf85162bfd2caced277fbd6dd79c9a670 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4fc1c4c73134de2f2f2e347f6405fafcb086760c07bd49f113e99c02b27b5d0 +size 396582032 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..54da6a780e73aebd0e51c42163476ba7c7729c25 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11c93b7ea852a910a7064fc62113250b944f3056c3d121b983f5220ad99845de +size 396582032 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..bbee0a7594ee198a24c61a395708da1b8f57a5c4 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aafbd61556872c03c4edb1fc6e9c553a07c6cab6823836342f0e6b926c05fd93 +size 2117321480 diff --git 
a/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_1_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..1202bfc9b58ab1c3b7f5387298eb410418c6dca9 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bb71539d4d115951af65853435d467f855a14091acc275f167c21787b26b229 +size 2117321480 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..99183d341a095ba06ca35199688ef54d161abf57 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc38f68173a04db909bfb37aa07e0d96c6a57d5f5a1b723feaec53b37642fbc1 +size 2117321480 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..e78ffa0c162b0daa28d9e3f57c116d0a10912abf --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/global_step3328/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8b829d6fa8e4497669c7b2eb4f9f938d099510eff5709e2ee4f6504ea3e3599 +size 2117321480 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/latest b/sft_pretrain/Full_smoe/checkpoint-3328/latest new file mode 100644 index 0000000000000000000000000000000000000000..2c27d5aabecd1a20f5d8e01a05251ed2cf0a7fec --- /dev/null +++ 
b/sft_pretrain/Full_smoe/checkpoint-3328/latest @@ -0,0 +1 @@ +global_step3328 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe/checkpoint-3328/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe/checkpoint-3328/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3a9e74914b40e5f0d2bb8dac3b044d41cb3611ae --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4078a79d88071b52a5b1a8047f5c25f79688fc005b8f9987221925263160f939 +size 3759025152 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/model.safetensors.index.json b/sft_pretrain/Full_smoe/checkpoint-3328/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3448fcaafe26e098595b9e2e5bd9e68d63ee24 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731424736 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + 
"model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + 
"model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + 
"model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + 
"model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", 
+ "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", 
+ "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_0.pth b/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_1.pth b/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_2.pth b/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_3.pth b/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/special_tokens_map.json b/sft_pretrain/Full_smoe/checkpoint-3328/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/tokenizer.model b/sft_pretrain/Full_smoe/checkpoint-3328/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- 
/dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/tokenizer_config.json b/sft_pretrain/Full_smoe/checkpoint-3328/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + 
"32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/trainer_state.json b/sft_pretrain/Full_smoe/checkpoint-3328/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e633ce1517201a4917e361403f07eb4e38684a02 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/trainer_state.json @@ -0,0 +1,49953 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6402462485571374, + "eval_steps": 
500, + "global_step": 3328, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03936368, + "balance_loss_mlp": 2.84994221, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 15.851083319408797, + "language_loss": 2.91765308, + "learning_rate": 0.0, + "loss": 1.97528625, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 22.685314178466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02018389, + "balance_loss_mlp": 1.26880157, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 26.607348306835714, + "language_loss": 2.4131012, + "learning_rate": 0.00013726078121135892, + "loss": 2.43328524, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 7.48828125, + "step": 2, + "time_per_iteration": 2.6085429191589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02035932, + "balance_loss_mlp": 1.28710687, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 23.480566200669728, + "language_loss": 2.12185097, + "learning_rate": 0.00021755319103969496, + "loss": 2.14221001, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 7.48046875, + "step": 3, + "time_per_iteration": 2.817356824874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02060169, + "balance_loss_mlp": 1.30028164, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 4.363008200765646, + "language_loss": 1.37660766, + "learning_rate": 0.00027452156242271784, + "loss": 1.39720929, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 7.59375, + "step": 4, + "time_per_iteration": 2.7677674293518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02070568, + "balance_loss_mlp": 1.31411338, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + 
"grad_norm": 0.9313793007477466, + "language_loss": 1.33924747, + "learning_rate": 0.0003187096642208417, + "loss": 1.35995317, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 7.55859375, + "step": 5, + "time_per_iteration": 2.649566650390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02075998, + "balance_loss_mlp": 1.31763589, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 2.3251391322215498, + "language_loss": 1.31535721, + "learning_rate": 0.0003548139722510539, + "loss": 1.33611727, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 7.578125, + "step": 6, + "time_per_iteration": 2.715332269668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02105134, + "balance_loss_mlp": 1.3406682, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.8930621517096357, + "language_loss": 1.22756648, + "learning_rate": 0.00038533972973918044, + "loss": 1.24861789, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 7.640625, + "step": 7, + "time_per_iteration": 2.620546340942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02159823, + "balance_loss_mlp": 1.38276935, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.2913046553555926, + "language_loss": 1.17756534, + "learning_rate": 0.0004117823436340768, + "loss": 1.19916344, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 7.76171875, + "step": 8, + "time_per_iteration": 2.6581108570098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02180456, + "balance_loss_mlp": 1.39310265, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.2812491955312875, + "language_loss": 1.24828589, + "learning_rate": 0.00043510638207938993, + "loss": 1.27009046, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 7.859375, + "step": 9, + "time_per_iteration": 2.7921459674835205 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.02220606, + "balance_loss_mlp": 1.43058181, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.32786568158439683, + "language_loss": 1.14205348, + "learning_rate": 0.00045597044543220066, + "loss": 1.16425967, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 7.8984375, + "step": 10, + "time_per_iteration": 2.7258670330047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0223461, + "balance_loss_mlp": 1.43886435, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.1860296084342833, + "language_loss": 1.11914992, + "learning_rate": 0.00047484428652143135, + "loss": 1.14149594, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 7.953125, + "step": 11, + "time_per_iteration": 2.907498359680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02235376, + "balance_loss_mlp": 1.4423002, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.11947281146450546, + "language_loss": 1.17959428, + "learning_rate": 0.0004920747534624128, + "loss": 1.20194793, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 7.92578125, + "step": 12, + "time_per_iteration": 2.6528539657592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218147, + "balance_loss_mlp": 1.42507148, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.12512408660007263, + "language_loss": 1.20210767, + "learning_rate": 0.0005079252465375872, + "loss": 1.22428906, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 7.921875, + "step": 13, + "time_per_iteration": 2.8123886585235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02214103, + "balance_loss_mlp": 1.42140937, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.16684445783545154, + "language_loss": 1.10100055, + "learning_rate": 0.0005226005109505393, + "loss": 1.12314165, + "num_input_tokens_seen": 982720, + 
"router_z_loss_mlp": 7.9140625, + "step": 14, + "time_per_iteration": 2.628995180130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130591, + "balance_loss_mlp": 1.36459994, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.1391159076902598, + "language_loss": 1.15644169, + "learning_rate": 0.0005362628552605367, + "loss": 1.17774749, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 7.65234375, + "step": 15, + "time_per_iteration": 2.650690793991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02123252, + "balance_loss_mlp": 1.36260176, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.12794674976623602, + "language_loss": 1.19969535, + "learning_rate": 0.0005490431248454357, + "loss": 1.22092795, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 7.6015625, + "step": 16, + "time_per_iteration": 2.7189841270446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0240823, + "balance_loss_mlp": 1.66054928, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2699272965631097, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78113341, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.46875, + "step": 17, + "time_per_iteration": 5.958680868148804 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02007176, + "balance_loss_mlp": 1.28352785, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.08195889268296155, + "language_loss": 1.0631001, + "learning_rate": 0.0005723671632907488, + "loss": 1.08317184, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.23046875, + "step": 18, + "time_per_iteration": 2.633267879486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01953804, + "balance_loss_mlp": 1.2572403, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11150538237586374, + 
"language_loss": 1.11837816, + "learning_rate": 0.0005830738490244919, + "loss": 1.13791621, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.97265625, + "step": 19, + "time_per_iteration": 2.526186466217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01920231, + "balance_loss_mlp": 1.24464774, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.09041890124841255, + "language_loss": 1.13942695, + "learning_rate": 0.0005932312266435596, + "loss": 1.15862942, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.76171875, + "step": 20, + "time_per_iteration": 2.8158531188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01861181, + "balance_loss_mlp": 1.21687818, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.1379829587383013, + "language_loss": 1.09075773, + "learning_rate": 0.0006028929207788754, + "loss": 1.10936952, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 6.44140625, + "step": 21, + "time_per_iteration": 2.7115283012390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01816904, + "balance_loss_mlp": 1.19815993, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.09955042249077097, + "language_loss": 1.11992621, + "learning_rate": 0.0006121050677327902, + "loss": 1.13809526, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 6.1796875, + "step": 22, + "time_per_iteration": 2.9170944690704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01769897, + "balance_loss_mlp": 1.18281531, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.08735903991507939, + "language_loss": 1.03007698, + "learning_rate": 0.0006209076479463684, + "loss": 1.04777598, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 5.8671875, + "step": 23, + "time_per_iteration": 2.6403517723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01733821, 
+ "balance_loss_mlp": 1.17191648, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.08709154861799764, + "language_loss": 1.12691391, + "learning_rate": 0.0006293355346737718, + "loss": 1.14425218, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 5.62890625, + "step": 24, + "time_per_iteration": 2.706193208694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01681551, + "balance_loss_mlp": 1.14711165, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.08429969570703955, + "language_loss": 1.08894634, + "learning_rate": 0.0006374193284416834, + "loss": 1.10576177, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 5.34765625, + "step": 25, + "time_per_iteration": 2.788973808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01660379, + "balance_loss_mlp": 1.15416873, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.1402992304177309, + "language_loss": 1.07612705, + "learning_rate": 0.0006451860277489461, + "loss": 1.09273076, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 5.05859375, + "step": 26, + "time_per_iteration": 2.6577279567718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01646, + "balance_loss_mlp": 1.17107058, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.16239245775773925, + "language_loss": 1.14940214, + "learning_rate": 0.0006526595731190848, + "loss": 1.16586208, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 4.73828125, + "step": 27, + "time_per_iteration": 2.4788224697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01586113, + "balance_loss_mlp": 1.1497122, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.0939755899328463, + "language_loss": 1.08969474, + "learning_rate": 0.0006598612921618983, + "loss": 1.10555601, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 4.375, + "step": 
28, + "time_per_iteration": 2.8451075553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01530584, + "balance_loss_mlp": 1.12393713, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.08153278055262643, + "language_loss": 1.02661419, + "learning_rate": 0.0006668102665011454, + "loss": 1.04191995, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 4.0703125, + "step": 29, + "time_per_iteration": 3.3112235069274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0149795, + "balance_loss_mlp": 1.11743355, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.14907920412525114, + "language_loss": 1.11315072, + "learning_rate": 0.0006735236364718957, + "loss": 1.1281302, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 3.80273438, + "step": 30, + "time_per_iteration": 2.744025945663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444605, + "balance_loss_mlp": 1.09651423, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.1454661106932218, + "language_loss": 1.10029531, + "learning_rate": 0.0006800168558381346, + "loss": 1.11474133, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 3.484375, + "step": 31, + "time_per_iteration": 2.6526310443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408917, + "balance_loss_mlp": 1.08962691, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.13886710462644744, + "language_loss": 1.12821865, + "learning_rate": 0.0006863039060567947, + "loss": 1.14230776, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 3.19140625, + "step": 32, + "time_per_iteration": 2.778316020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01386345, + "balance_loss_mlp": 1.0916599, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.0950000822083296, + "language_loss": 1.06182003, + "learning_rate": 
0.0006923974775611263, + "loss": 1.07568347, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 2.94726562, + "step": 33, + "time_per_iteration": 2.822932243347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01377092, + "balance_loss_mlp": 1.10586727, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.0933492164101247, + "language_loss": 1.02986193, + "learning_rate": 0.0006983091239737814, + "loss": 1.04363275, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 2.71484375, + "step": 34, + "time_per_iteration": 3.030482530593872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01362684, + "balance_loss_mlp": 1.11224914, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.11255876729792032, + "language_loss": 1.0177412, + "learning_rate": 0.0007040493939600222, + "loss": 1.03136802, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 2.50195312, + "step": 35, + "time_per_iteration": 2.849836826324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339928, + "balance_loss_mlp": 1.10723162, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.05318315286224845, + "language_loss": 1.02413034, + "learning_rate": 0.0007096279445021078, + "loss": 1.03752947, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 2.32421875, + "step": 36, + "time_per_iteration": 2.7724404335021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333933, + "balance_loss_mlp": 1.12202668, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.09673231095327042, + "language_loss": 1.09330344, + "learning_rate": 0.0007150536386503726, + "loss": 1.10664272, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 2.12304688, + "step": 37, + "time_per_iteration": 2.87898588180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131197, + "balance_loss_mlp": 1.11932778, + "epoch": 
0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.1501774474729275, + "language_loss": 1.02011764, + "learning_rate": 0.0007203346302358509, + "loss": 1.03323734, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 1.92578125, + "step": 38, + "time_per_iteration": 2.9664244651794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301697, + "balance_loss_mlp": 1.11916423, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.13354549864664766, + "language_loss": 1.06722176, + "learning_rate": 0.000725478437577282, + "loss": 1.08023882, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 1.82324219, + "step": 39, + "time_per_iteration": 2.8403327465057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269709, + "balance_loss_mlp": 1.10262501, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.06892008670865749, + "language_loss": 1.01746094, + "learning_rate": 0.0007304920078549186, + "loss": 1.03015804, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 1.671875, + "step": 40, + "time_per_iteration": 2.7219579219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271556, + "balance_loss_mlp": 1.1131506, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.1603294487215327, + "language_loss": 1.03720689, + "learning_rate": 0.0007353817735343603, + "loss": 1.04992247, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 1.58300781, + "step": 41, + "time_per_iteration": 2.7060108184814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246674, + "balance_loss_mlp": 1.10390913, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.0511848053751201, + "language_loss": 0.99442279, + "learning_rate": 0.0007401537019902344, + "loss": 1.00688958, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 1.42871094, + "step": 42, + "time_per_iteration": 
2.633784294128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227359, + "balance_loss_mlp": 1.0990901, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.10374794700504324, + "language_loss": 1.02897811, + "learning_rate": 0.0007448133392900729, + "loss": 1.04125178, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.28222656, + "step": 43, + "time_per_iteration": 2.7279117107391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123262, + "balance_loss_mlp": 1.11207604, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.09096864884609944, + "language_loss": 0.98755985, + "learning_rate": 0.0007493658489441491, + "loss": 0.99988604, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.20410156, + "step": 44, + "time_per_iteration": 2.8941659927368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217323, + "balance_loss_mlp": 1.10812736, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.11598802445363406, + "language_loss": 1.0210619, + "learning_rate": 0.0007538160463002316, + "loss": 1.03323507, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.09375, + "step": 45, + "time_per_iteration": 2.7019526958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216008, + "balance_loss_mlp": 1.11510944, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.06911757836433406, + "language_loss": 1.05356646, + "learning_rate": 0.0007581684291577274, + "loss": 1.06572652, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 1.00927734, + "step": 46, + "time_per_iteration": 2.5990471839904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209164, + "balance_loss_mlp": 1.11603808, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.09057982339816145, + "language_loss": 1.08819616, + "learning_rate": 0.0007624272050891776, + 
"loss": 1.10028791, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.93066406, + "step": 47, + "time_per_iteration": 2.8298892974853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175031, + "balance_loss_mlp": 1.09315765, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.06662076278867826, + "language_loss": 0.98563552, + "learning_rate": 0.0007665963158851307, + "loss": 0.99738586, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.81884766, + "step": 48, + "time_per_iteration": 2.840701103210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175621, + "balance_loss_mlp": 1.10109115, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.07605871591802618, + "language_loss": 1.06984305, + "learning_rate": 0.0007706794594783609, + "loss": 1.08159924, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.74511719, + "step": 49, + "time_per_iteration": 2.7622482776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171173, + "balance_loss_mlp": 1.10093522, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.048657988043197084, + "language_loss": 1.05961394, + "learning_rate": 0.0007746801096530423, + "loss": 1.07132566, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.703125, + "step": 50, + "time_per_iteration": 2.768888473510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173086, + "balance_loss_mlp": 1.10890365, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.10082721582881933, + "language_loss": 1.10655856, + "learning_rate": 0.0007786015338021173, + "loss": 1.11828947, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.64160156, + "step": 51, + "time_per_iteration": 2.6473164558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155225, + "balance_loss_mlp": 1.09590614, + "epoch": 0.010003847633705272, + 
"flos": 535881028608.0, + "grad_norm": 0.0966315307988203, + "language_loss": 1.03207719, + "learning_rate": 0.0007824468089603051, + "loss": 1.04362941, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.59277344, + "step": 52, + "time_per_iteration": 2.6773018836975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011599, + "balance_loss_mlp": 1.10766244, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.060495503821074374, + "language_loss": 1.02858949, + "learning_rate": 0.0007862188363098669, + "loss": 1.04018843, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.52319336, + "step": 53, + "time_per_iteration": 3.174023389816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150276, + "balance_loss_mlp": 1.10125709, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.08315393852704078, + "language_loss": 1.03287244, + "learning_rate": 0.0007899203543304438, + "loss": 1.04437518, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.48974609, + "step": 54, + "time_per_iteration": 2.7804617881774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158937, + "balance_loss_mlp": 1.11192107, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.13140028768126893, + "language_loss": 1.16694331, + "learning_rate": 0.0007935539507422731, + "loss": 1.1785326, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.47021484, + "step": 55, + "time_per_iteration": 2.6466386318206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137235, + "balance_loss_mlp": 1.09496331, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.06179334078386534, + "language_loss": 1.08511901, + "learning_rate": 0.0007971220733732573, + "loss": 1.09649134, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.42285156, + "step": 56, + "time_per_iteration": 2.7039074897766113 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138761, + "balance_loss_mlp": 1.10166252, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.08220293288244152, + "language_loss": 1.03500617, + "learning_rate": 0.0008006270400641869, + "loss": 1.04639375, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.37084961, + "step": 57, + "time_per_iteration": 2.7175657749176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113224, + "balance_loss_mlp": 1.0981698, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.07093854356308794, + "language_loss": 1.04580712, + "learning_rate": 0.0008040710477125043, + "loss": 1.0571295, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.34106445, + "step": 58, + "time_per_iteration": 2.7424120903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135189, + "balance_loss_mlp": 1.10312176, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.07916475402820797, + "language_loss": 1.05395138, + "learning_rate": 0.0008074561805429771, + "loss": 1.06530333, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.32055664, + "step": 59, + "time_per_iteration": 2.7407617568969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130551, + "balance_loss_mlp": 1.10155916, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.04727062297709066, + "language_loss": 1.03273892, + "learning_rate": 0.0008107844176832545, + "loss": 1.04404449, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.28979492, + "step": 60, + "time_per_iteration": 2.6854803562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141191, + "balance_loss_mlp": 1.11353481, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.0952762711916136, + "language_loss": 1.04648042, + "learning_rate": 0.0008140576401132568, + "loss": 1.05789232, + 
"num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.27685547, + "step": 61, + "time_per_iteration": 2.6589457988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137564, + "balance_loss_mlp": 1.11303091, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.07958403959823916, + "language_loss": 1.06014252, + "learning_rate": 0.0008172776370494935, + "loss": 1.07151818, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.24536133, + "step": 62, + "time_per_iteration": 2.768505334854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112985, + "balance_loss_mlp": 1.10548401, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.09183855716602674, + "language_loss": 1.12897038, + "learning_rate": 0.0008204461118185703, + "loss": 1.14026892, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.24353027, + "step": 63, + "time_per_iteration": 2.5573627948760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130628, + "balance_loss_mlp": 1.10793018, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.09747713298414284, + "language_loss": 1.02471447, + "learning_rate": 0.0008235646872681536, + "loss": 1.03602076, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.22692871, + "step": 64, + "time_per_iteration": 2.585127353668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127769, + "balance_loss_mlp": 1.10554826, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.10571288349070412, + "language_loss": 1.02039421, + "learning_rate": 0.0008266349107584288, + "loss": 1.03167176, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.2220459, + "step": 65, + "time_per_iteration": 2.703620433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140513, + "balance_loss_mlp": 1.11872149, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + 
"grad_norm": 0.14637461076762864, + "language_loss": 1.05036354, + "learning_rate": 0.0008296582587724851, + "loss": 1.06176865, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.21801758, + "step": 66, + "time_per_iteration": 2.728839159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121296, + "balance_loss_mlp": 1.09962404, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.10157918798152736, + "language_loss": 1.03485751, + "learning_rate": 0.0008326361411800136, + "loss": 1.04607058, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.21704102, + "step": 67, + "time_per_iteration": 2.963634729385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119673, + "balance_loss_mlp": 1.09863222, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.046087363126128704, + "language_loss": 1.03369427, + "learning_rate": 0.0008355699051851403, + "loss": 1.044891, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.21057129, + "step": 68, + "time_per_iteration": 2.7779767513275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146637, + "balance_loss_mlp": 1.12541735, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.10078437623262682, + "language_loss": 1.10584092, + "learning_rate": 0.0008384608389860635, + "loss": 1.11730719, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.21228027, + "step": 69, + "time_per_iteration": 2.72163724899292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158467, + "balance_loss_mlp": 1.13795137, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.07269171982270876, + "language_loss": 1.00728607, + "learning_rate": 0.000841310175171381, + "loss": 1.01887083, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.2052002, + "step": 70, + "time_per_iteration": 2.653019666671753 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01157048, + "balance_loss_mlp": 1.13693786, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.09340958478170322, + "language_loss": 0.98922431, + "learning_rate": 0.000844119093875517, + "loss": 1.00079489, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.2010498, + "step": 71, + "time_per_iteration": 2.722351551055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152491, + "balance_loss_mlp": 1.13224936, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.08018714642813927, + "language_loss": 1.04454517, + "learning_rate": 0.0008468887257134666, + "loss": 1.05607009, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.20239258, + "step": 72, + "time_per_iteration": 2.7619922161102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134825, + "balance_loss_mlp": 1.11441696, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.07872027680195416, + "language_loss": 1.06334233, + "learning_rate": 0.0008496201545131264, + "loss": 1.07469058, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.20410156, + "step": 73, + "time_per_iteration": 2.7532896995544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135406, + "balance_loss_mlp": 1.11529493, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.07696506497764126, + "language_loss": 1.03964853, + "learning_rate": 0.0008523144198617317, + "loss": 1.0510025, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.20092773, + "step": 74, + "time_per_iteration": 3.220428943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113223, + "balance_loss_mlp": 1.11140466, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.08624589903961616, + "language_loss": 1.03597379, + "learning_rate": 0.0008549725194813783, + "loss": 1.04729605, + "num_input_tokens_seen": 5590576, + 
"router_z_loss_mlp": 0.20825195, + "step": 75, + "time_per_iteration": 2.6929681301116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126542, + "balance_loss_mlp": 1.1071701, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.1408086440247197, + "language_loss": 1.02827942, + "learning_rate": 0.0008575954114472099, + "loss": 1.03954494, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.19360352, + "step": 76, + "time_per_iteration": 3.1799752712249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139543, + "balance_loss_mlp": 1.12005258, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.07592994584910524, + "language_loss": 1.00451732, + "learning_rate": 0.0008601840162606118, + "loss": 1.01591277, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.19470215, + "step": 77, + "time_per_iteration": 3.0833282470703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138555, + "balance_loss_mlp": 1.11827779, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.08431129228371863, + "language_loss": 1.0643971, + "learning_rate": 0.000862739218788641, + "loss": 1.07578266, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.20275879, + "step": 78, + "time_per_iteration": 2.8568053245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141293, + "balance_loss_mlp": 1.121135, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.11686764405791189, + "language_loss": 1.04346561, + "learning_rate": 0.0008652618700799138, + "loss": 1.05487859, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.20153809, + "step": 79, + "time_per_iteration": 2.6828417778015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144733, + "balance_loss_mlp": 1.12453914, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.10817734170715895, + 
"language_loss": 1.03413367, + "learning_rate": 0.0008677527890662774, + "loss": 1.0455811, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.2019043, + "step": 80, + "time_per_iteration": 2.4982268810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142756, + "balance_loss_mlp": 1.12232339, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.09792899658664883, + "language_loss": 1.04667735, + "learning_rate": 0.0008702127641587799, + "loss": 1.05810475, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.20422363, + "step": 81, + "time_per_iteration": 2.7113406658172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136455, + "balance_loss_mlp": 1.11561751, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.10099710945407976, + "language_loss": 1.00204504, + "learning_rate": 0.0008726425547457192, + "loss": 1.01340961, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.20825195, + "step": 82, + "time_per_iteration": 2.8304948806762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140166, + "balance_loss_mlp": 1.12054384, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.11260184265628481, + "language_loss": 0.99513066, + "learning_rate": 0.0008750428925998964, + "loss": 1.00653231, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.19604492, + "step": 83, + "time_per_iteration": 2.762498617172241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147828, + "balance_loss_mlp": 1.12830114, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.1180796768296156, + "language_loss": 1.05058432, + "learning_rate": 0.0008774144832015932, + "loss": 1.06206274, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.19519043, + "step": 84, + "time_per_iteration": 2.749310255050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01971265, + 
"balance_loss_mlp": 1.95724583, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.4228509486674634, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.7674557, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.140625, + "step": 85, + "time_per_iteration": 4.626708745956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137886, + "balance_loss_mlp": 1.11834753, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.09445954258913132, + "language_loss": 1.0054847, + "learning_rate": 0.0008820741205014318, + "loss": 1.01686358, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.1953125, + "step": 86, + "time_per_iteration": 2.918696403503418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145479, + "balance_loss_mlp": 1.12540436, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.14940142735495454, + "language_loss": 1.02554607, + "learning_rate": 0.0008843634575408404, + "loss": 1.03700089, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20068359, + "step": 87, + "time_per_iteration": 2.6972436904907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140944, + "balance_loss_mlp": 1.12226439, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.07729580722494055, + "language_loss": 1.03912258, + "learning_rate": 0.0008866266301555082, + "loss": 1.0505321, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.18676758, + "step": 88, + "time_per_iteration": 2.741374969482422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164989, + "balance_loss_mlp": 1.14647579, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.12135336715529384, + "language_loss": 1.04746294, + "learning_rate": 0.0008888642296509615, + "loss": 1.05911291, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.18493652, + 
"step": 89, + "time_per_iteration": 2.62099552154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183021, + "balance_loss_mlp": 1.16370893, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.13101998707614188, + "language_loss": 1.08785903, + "learning_rate": 0.0008910768275115906, + "loss": 1.09968925, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.19311523, + "step": 90, + "time_per_iteration": 2.819420099258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181615, + "balance_loss_mlp": 1.16215992, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.1050230223941115, + "language_loss": 1.04935551, + "learning_rate": 0.0008932649762767675, + "loss": 1.06117165, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.19445801, + "step": 91, + "time_per_iteration": 2.622406244277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182279, + "balance_loss_mlp": 1.16277599, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.08683374673810437, + "language_loss": 1.07276869, + "learning_rate": 0.0008954292103690864, + "loss": 1.08459151, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.19494629, + "step": 92, + "time_per_iteration": 2.9198801517486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185402, + "balance_loss_mlp": 1.16578054, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.18507766534910622, + "language_loss": 1.0957979, + "learning_rate": 0.0008975700468778296, + "loss": 1.10765195, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.19616699, + "step": 93, + "time_per_iteration": 2.6395699977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183129, + "balance_loss_mlp": 1.1639359, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.14308809926092464, + "language_loss": 1.0301311, + 
"learning_rate": 0.0008996879863005366, + "loss": 1.04196239, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.19189453, + "step": 94, + "time_per_iteration": 2.685325860977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192741, + "balance_loss_mlp": 1.17335784, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.08942422865693514, + "language_loss": 1.02994668, + "learning_rate": 0.0009017835132453337, + "loss": 1.04187417, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.19360352, + "step": 95, + "time_per_iteration": 2.640179395675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185298, + "balance_loss_mlp": 1.1659379, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.12775293798220247, + "language_loss": 1.03491902, + "learning_rate": 0.0009038570970964896, + "loss": 1.046772, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.19348145, + "step": 96, + "time_per_iteration": 2.8062894344329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153754, + "balance_loss_mlp": 1.13440657, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.07493462569205835, + "language_loss": 1.00418913, + "learning_rate": 0.0009059091926454854, + "loss": 1.01572669, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.1932373, + "step": 97, + "time_per_iteration": 2.625839948654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147507, + "balance_loss_mlp": 1.12845731, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.09820444328466757, + "language_loss": 0.99835473, + "learning_rate": 0.0009079402406897198, + "loss": 1.00982976, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.19042969, + "step": 98, + "time_per_iteration": 3.2515511512756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153758, + "balance_loss_mlp": 1.13449359, 
+ "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.1057784840893083, + "language_loss": 1.01116824, + "learning_rate": 0.0009099506686008212, + "loss": 1.02270579, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.19262695, + "step": 99, + "time_per_iteration": 2.8564164638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131688, + "balance_loss_mlp": 1.11337709, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.06422494393503501, + "language_loss": 1.04474521, + "learning_rate": 0.0009119408908644013, + "loss": 1.0560621, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.18310547, + "step": 100, + "time_per_iteration": 2.717921495437622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126129, + "balance_loss_mlp": 1.10765147, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.13157053780302536, + "language_loss": 1.09764636, + "learning_rate": 0.0009139113095929519, + "loss": 1.1089077, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.18469238, + "step": 101, + "time_per_iteration": 2.8778345584869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147568, + "balance_loss_mlp": 1.12801814, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.09138386946816152, + "language_loss": 1.03731561, + "learning_rate": 0.0009158623150134762, + "loss": 1.04879129, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.19543457, + "step": 102, + "time_per_iteration": 2.588974952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127418, + "balance_loss_mlp": 1.10807002, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.09239208832300977, + "language_loss": 1.03516126, + "learning_rate": 0.000917794285931332, + "loss": 1.04643536, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.1932373, + "step": 103, + 
"time_per_iteration": 2.680100917816162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126804, + "balance_loss_mlp": 1.10709858, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.06521053042835766, + "language_loss": 0.95701432, + "learning_rate": 0.0009197075901716639, + "loss": 0.96828234, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.19689941, + "step": 104, + "time_per_iteration": 2.730409860610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154929, + "balance_loss_mlp": 1.13441312, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.1079045695165621, + "language_loss": 1.06002212, + "learning_rate": 0.0009216025849997171, + "loss": 1.07157135, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.2052002, + "step": 105, + "time_per_iteration": 2.8010010719299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125545, + "balance_loss_mlp": 1.10562515, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.06774775888709755, + "language_loss": 1.00999045, + "learning_rate": 0.0009234796175212258, + "loss": 1.02124596, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.19909668, + "step": 106, + "time_per_iteration": 3.0094785690307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134088, + "balance_loss_mlp": 1.11433506, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.09956588263492473, + "language_loss": 1.04219186, + "learning_rate": 0.000925339025064007, + "loss": 1.05353272, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.1973877, + "step": 107, + "time_per_iteration": 2.9836714267730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112225, + "balance_loss_mlp": 1.1024735, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.06168154311284234, + "language_loss": 0.97232246, + "learning_rate": 
0.0009271811355418027, + "loss": 0.98354501, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.19775391, + "step": 108, + "time_per_iteration": 2.860042095184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120623, + "balance_loss_mlp": 1.10089409, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.10451884090896614, + "language_loss": 1.03835416, + "learning_rate": 0.0009290062678013548, + "loss": 1.04956043, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.19714355, + "step": 109, + "time_per_iteration": 2.8912689685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116129, + "balance_loss_mlp": 1.09641171, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.12087494450090952, + "language_loss": 1.02292705, + "learning_rate": 0.0009308147319536321, + "loss": 1.03408837, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.19702148, + "step": 110, + "time_per_iteration": 2.682143449783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123774, + "balance_loss_mlp": 1.10437846, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.09468465669548881, + "language_loss": 1.08714509, + "learning_rate": 0.0009326068296900676, + "loss": 1.09838271, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.19372559, + "step": 111, + "time_per_iteration": 2.8420276641845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113573, + "balance_loss_mlp": 1.09368885, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.06573635575260657, + "language_loss": 1.00160766, + "learning_rate": 0.0009343828545846161, + "loss": 1.01274335, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.19873047, + "step": 112, + "time_per_iteration": 2.81919264793396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140317, + "balance_loss_mlp": 1.1205641, + "epoch": 
0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.10387186502959084, + "language_loss": 1.03632593, + "learning_rate": 0.0009361430923823841, + "loss": 1.04772925, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.1973877, + "step": 113, + "time_per_iteration": 2.6119744777679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125249, + "balance_loss_mlp": 1.1051383, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.07902170601711563, + "language_loss": 1.07192981, + "learning_rate": 0.0009378878212755459, + "loss": 1.08318233, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.2010498, + "step": 114, + "time_per_iteration": 2.511798143386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121266, + "balance_loss_mlp": 1.10053515, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.07803352047235128, + "language_loss": 0.97866738, + "learning_rate": 0.0009396173121672103, + "loss": 0.98988008, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.20739746, + "step": 115, + "time_per_iteration": 2.664508819580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129328, + "balance_loss_mlp": 1.10866928, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.0856857268925464, + "language_loss": 1.03136635, + "learning_rate": 0.0009413318289238633, + "loss": 1.04265964, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.20666504, + "step": 116, + "time_per_iteration": 2.78078031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107565, + "balance_loss_mlp": 1.08696532, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.07931864844683259, + "language_loss": 0.9541564, + "learning_rate": 0.0009430316286169771, + "loss": 0.96523207, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.20605469, + "step": 117, + "time_per_iteration": 
3.034813404083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162526, + "balance_loss_mlp": 1.14062762, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.10907247817659571, + "language_loss": 1.00617993, + "learning_rate": 0.0009447169617543361, + "loss": 1.0178051, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.21899414, + "step": 118, + "time_per_iteration": 2.6340808868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173058, + "balance_loss_mlp": 1.15192246, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.12286397781369558, + "language_loss": 1.06791735, + "learning_rate": 0.0009463880725016029, + "loss": 1.0796479, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.21142578, + "step": 119, + "time_per_iteration": 2.7167999744415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112992, + "balance_loss_mlp": 1.10922527, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.1818455397825579, + "language_loss": 1.0306797, + "learning_rate": 0.0009480451988946134, + "loss": 1.04197884, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.20703125, + "step": 120, + "time_per_iteration": 2.8320834636688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127416, + "balance_loss_mlp": 1.10706663, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.061341545621049966, + "language_loss": 1.03699958, + "learning_rate": 0.0009496885730428627, + "loss": 1.04827368, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.20349121, + "step": 121, + "time_per_iteration": 3.0393545627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141944, + "balance_loss_mlp": 1.12239408, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.12547580017264032, + "language_loss": 1.01912796, + "learning_rate": 
0.0009513184213246156, + "loss": 1.0305475, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.19543457, + "step": 122, + "time_per_iteration": 2.651719093322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162278, + "balance_loss_mlp": 1.14191747, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.1065762842559702, + "language_loss": 1.05289114, + "learning_rate": 0.0009529349645740552, + "loss": 1.06451392, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.20361328, + "step": 123, + "time_per_iteration": 2.705214262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165232, + "balance_loss_mlp": 1.14444137, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.12380591024493681, + "language_loss": 1.04425788, + "learning_rate": 0.0009545384182608524, + "loss": 1.05591035, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.2076416, + "step": 124, + "time_per_iteration": 2.544631004333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143364, + "balance_loss_mlp": 1.12262154, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.07613482272455964, + "language_loss": 1.01444972, + "learning_rate": 0.0009561289926625252, + "loss": 1.0258832, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.20739746, + "step": 125, + "time_per_iteration": 2.6732449531555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140968, + "balance_loss_mlp": 1.11927211, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.13062804118014867, + "language_loss": 1.05952811, + "learning_rate": 0.0009577068930299292, + "loss": 1.07093775, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.21691895, + "step": 126, + "time_per_iteration": 2.5860514640808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111442, + "balance_loss_mlp": 1.09249783, + "epoch": 
0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.11550485665133546, + "language_loss": 1.01208651, + "learning_rate": 0.0009592723197462087, + "loss": 1.02323079, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.21923828, + "step": 127, + "time_per_iteration": 2.680792808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139077, + "balance_loss_mlp": 1.1162957, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.07531268866570652, + "language_loss": 0.98376709, + "learning_rate": 0.0009608254684795125, + "loss": 0.99515784, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.2277832, + "step": 128, + "time_per_iteration": 2.962553024291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151675, + "balance_loss_mlp": 1.12746358, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.10067493874109901, + "language_loss": 1.01099372, + "learning_rate": 0.0009623665303297678, + "loss": 1.02251053, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.24206543, + "step": 129, + "time_per_iteration": 2.7238845825195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178581, + "balance_loss_mlp": 1.1552279, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.11648498824832396, + "language_loss": 1.04954159, + "learning_rate": 0.0009638956919697878, + "loss": 1.06132734, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.23352051, + "step": 130, + "time_per_iteration": 2.878931999206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180963, + "balance_loss_mlp": 1.15737128, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.07835178368021106, + "language_loss": 0.97041726, + "learning_rate": 0.0009654131357809714, + "loss": 0.98222685, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.23596191, + "step": 131, + "time_per_iteration": 
2.646268367767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187479, + "balance_loss_mlp": 1.1633389, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.08592709100669786, + "language_loss": 1.06445599, + "learning_rate": 0.0009669190399838441, + "loss": 1.07633078, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.24169922, + "step": 132, + "time_per_iteration": 3.1253442764282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178278, + "balance_loss_mlp": 1.15288627, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.06433616224475917, + "language_loss": 0.99044776, + "learning_rate": 0.0009684135787636724, + "loss": 1.00223053, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.25402832, + "step": 133, + "time_per_iteration": 2.831838846206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193803, + "balance_loss_mlp": 1.16735041, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.10671106752503096, + "language_loss": 1.03402495, + "learning_rate": 0.0009698969223913726, + "loss": 1.04596305, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.26452637, + "step": 134, + "time_per_iteration": 3.0395402908325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167127, + "balance_loss_mlp": 1.14202118, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.08439936893253437, + "language_loss": 1.06654739, + "learning_rate": 0.0009713692373399265, + "loss": 1.0782187, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.25109863, + "step": 135, + "time_per_iteration": 2.7715206146240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463238, + "balance_loss_mlp": 1.43119502, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.13141202298162255, + "language_loss": 0.79456228, + "learning_rate": 
0.0009728306863964993, + "loss": 0.80919468, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.3203125, + "step": 136, + "time_per_iteration": 5.708434820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01366397, + "balance_loss_mlp": 1.3391223, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.10789098637796743, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79177433, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.2734375, + "step": 137, + "time_per_iteration": 4.936312198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164239, + "balance_loss_mlp": 1.14192283, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.07737815407023008, + "language_loss": 0.99685001, + "learning_rate": 0.0009757216201974225, + "loss": 1.00849247, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.22338867, + "step": 138, + "time_per_iteration": 2.848794460296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186286, + "balance_loss_mlp": 1.16373122, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.07356199280990307, + "language_loss": 1.04477906, + "learning_rate": 0.0009771514130396581, + "loss": 1.05664206, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.22546387, + "step": 139, + "time_per_iteration": 2.735100746154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191574, + "balance_loss_mlp": 1.17103469, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.09793912671864533, + "language_loss": 1.04422235, + "learning_rate": 0.00097857095638274, + "loss": 1.05613816, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.20544434, + "step": 140, + "time_per_iteration": 2.6398932933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187088, + "balance_loss_mlp": 1.16559434, + 
"epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.08846308668893199, + "language_loss": 0.95874435, + "learning_rate": 0.0009799803961288726, + "loss": 0.97061527, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.21484375, + "step": 141, + "time_per_iteration": 3.0505003929138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160228, + "balance_loss_mlp": 1.13921118, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.09598553540990232, + "language_loss": 1.0168581, + "learning_rate": 0.000981379875086876, + "loss": 1.02846038, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.21020508, + "step": 142, + "time_per_iteration": 3.0870697498321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143434, + "balance_loss_mlp": 1.12091553, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.08800286540083159, + "language_loss": 0.96917391, + "learning_rate": 0.0009827695330590185, + "loss": 0.98060828, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.2253418, + "step": 143, + "time_per_iteration": 2.719317674636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128989, + "balance_loss_mlp": 1.10631514, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.09792527853337853, + "language_loss": 0.96426451, + "learning_rate": 0.0009841495069248256, + "loss": 0.97555441, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.22692871, + "step": 144, + "time_per_iteration": 3.014765739440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011276, + "balance_loss_mlp": 1.10584438, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.06966533855263184, + "language_loss": 0.95713264, + "learning_rate": 0.0009855199307219871, + "loss": 0.9684087, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.21777344, + "step": 145, + 
"time_per_iteration": 2.6709253787994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148016, + "balance_loss_mlp": 1.12558043, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.09436929899226476, + "language_loss": 0.97337723, + "learning_rate": 0.0009868809357244854, + "loss": 0.98485744, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.22424316, + "step": 146, + "time_per_iteration": 2.669283390045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119223, + "balance_loss_mlp": 1.09726429, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.0790038702862921, + "language_loss": 1.01669443, + "learning_rate": 0.0009882326505180556, + "loss": 1.02788651, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.21948242, + "step": 147, + "time_per_iteration": 2.704292058944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138866, + "balance_loss_mlp": 1.11706281, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.10005424592603226, + "language_loss": 0.99863935, + "learning_rate": 0.0009895752010730906, + "loss": 1.010028, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.21801758, + "step": 148, + "time_per_iteration": 2.9581809043884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113837, + "balance_loss_mlp": 1.09122324, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.059614583623975884, + "language_loss": 1.06015503, + "learning_rate": 0.0009909087108150867, + "loss": 1.07129347, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.22619629, + "step": 149, + "time_per_iteration": 2.741159200668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121459, + "balance_loss_mlp": 1.09761691, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.11202611832617501, + "language_loss": 1.06769323, + 
"learning_rate": 0.0009922333006927371, + "loss": 1.07890773, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.23852539, + "step": 150, + "time_per_iteration": 2.4982028007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130563, + "balance_loss_mlp": 1.10591054, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.07605307561327067, + "language_loss": 1.00263429, + "learning_rate": 0.0009935490892437632, + "loss": 1.01393986, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.2467041, + "step": 151, + "time_per_iteration": 2.603449583053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144852, + "balance_loss_mlp": 1.12064028, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.10272840367827417, + "language_loss": 0.98558784, + "learning_rate": 0.0009948561926585687, + "loss": 0.99703634, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.24206543, + "step": 152, + "time_per_iteration": 2.8270881175994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152248, + "balance_loss_mlp": 1.12610579, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09161667027770293, + "language_loss": 1.02430511, + "learning_rate": 0.0009961547248418122, + "loss": 1.03582752, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.26159668, + "step": 153, + "time_per_iteration": 2.6539955139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145984, + "balance_loss_mlp": 1.11949599, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.12801957864517624, + "language_loss": 0.99122071, + "learning_rate": 0.0009974447974719707, + "loss": 1.00268054, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.26477051, + "step": 154, + "time_per_iteration": 2.7382068634033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149383, + "balance_loss_mlp": 
1.12209582, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.08800353648973465, + "language_loss": 1.01358569, + "learning_rate": 0.0009987265200589763, + "loss": 1.02507949, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.27307129, + "step": 155, + "time_per_iteration": 2.7484042644500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146509, + "balance_loss_mlp": 1.11942446, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.06940376161599653, + "language_loss": 1.00859666, + "learning_rate": 0.001, + "loss": 1.02006161, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.27124023, + "step": 156, + "time_per_iteration": 2.9298081398010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143379, + "balance_loss_mlp": 1.11696208, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.07290625745558146, + "language_loss": 0.98239183, + "learning_rate": 0.0009999999029413921, + "loss": 0.99382555, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.2644043, + "step": 157, + "time_per_iteration": 2.8521509170532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143494, + "balance_loss_mlp": 1.11851931, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.10227720632383495, + "language_loss": 0.99759698, + "learning_rate": 0.0009999996117656068, + "loss": 1.00903201, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.24975586, + "step": 158, + "time_per_iteration": 2.7299323081970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132124, + "balance_loss_mlp": 1.10562384, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.10099325970079884, + "language_loss": 0.93055141, + "learning_rate": 0.0009999991264727564, + "loss": 0.94187272, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.26489258, + "step": 159, + 
"time_per_iteration": 2.838892698287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115378, + "balance_loss_mlp": 1.08908033, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.07019569540009855, + "language_loss": 1.04060161, + "learning_rate": 0.0009999984470630296, + "loss": 1.05175543, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.26330566, + "step": 160, + "time_per_iteration": 2.6468522548675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127141, + "balance_loss_mlp": 1.09948444, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.11009923170951091, + "language_loss": 0.93022841, + "learning_rate": 0.0009999975735366902, + "loss": 0.94149983, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.27636719, + "step": 161, + "time_per_iteration": 3.0944836139678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113017, + "balance_loss_mlp": 1.10256159, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.1021195191465028, + "language_loss": 0.94580781, + "learning_rate": 0.0009999965058940775, + "loss": 0.95710957, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.27624512, + "step": 162, + "time_per_iteration": 3.5266401767730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140975, + "balance_loss_mlp": 1.11293721, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.10168672994339449, + "language_loss": 1.00657988, + "learning_rate": 0.0009999952441356057, + "loss": 1.01798964, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.28027344, + "step": 163, + "time_per_iteration": 2.5260584354400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117443, + "balance_loss_mlp": 1.09220648, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.059509842301402785, + "language_loss": 1.01277101, + 
"learning_rate": 0.000999993788261765, + "loss": 1.02394545, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.25231934, + "step": 164, + "time_per_iteration": 3.585451126098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117278, + "balance_loss_mlp": 1.09226811, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.08345282656088489, + "language_loss": 1.02586234, + "learning_rate": 0.00099999213827312, + "loss": 1.03703511, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.25036621, + "step": 165, + "time_per_iteration": 2.815709352493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126749, + "balance_loss_mlp": 1.10315728, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.06906572503593703, + "language_loss": 0.97404492, + "learning_rate": 0.000999990294170312, + "loss": 0.98531234, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.23596191, + "step": 166, + "time_per_iteration": 2.663247585296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123432, + "balance_loss_mlp": 1.09961414, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.06114993163800343, + "language_loss": 1.01775765, + "learning_rate": 0.0009999882559540566, + "loss": 1.02899194, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.23803711, + "step": 167, + "time_per_iteration": 2.6779284477233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113429, + "balance_loss_mlp": 1.08983719, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.051224230506458926, + "language_loss": 0.98247135, + "learning_rate": 0.000999986023625145, + "loss": 0.99360555, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.23571777, + "step": 168, + "time_per_iteration": 2.8207764625549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02968107, + "balance_loss_mlp": 
2.92347527, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.42377736764400964, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.81892526, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.44726562, + "step": 169, + "time_per_iteration": 5.030913591384888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110197, + "balance_loss_mlp": 1.08739257, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.11749522299339567, + "language_loss": 0.99391603, + "learning_rate": 0.0009999809766328958, + "loss": 1.005018, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.22839355, + "step": 170, + "time_per_iteration": 2.7288546562194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138899, + "balance_loss_mlp": 1.11526036, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.10262090882217431, + "language_loss": 1.01489758, + "learning_rate": 0.0009999781619715177, + "loss": 1.02628672, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.23620605, + "step": 171, + "time_per_iteration": 2.57743239402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152618, + "balance_loss_mlp": 1.12847793, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.09929883602390663, + "language_loss": 1.00886559, + "learning_rate": 0.000999975153201402, + "loss": 1.0203917, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.24121094, + "step": 172, + "time_per_iteration": 2.8398427963256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164032, + "balance_loss_mlp": 1.13917661, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.07630899187603161, + "language_loss": 0.98461914, + "learning_rate": 0.0009999719503237174, + "loss": 0.99625951, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.24865723, + "step": 173, + 
"time_per_iteration": 2.7653093338012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119131, + "balance_loss_mlp": 1.16379607, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.11225996182460839, + "language_loss": 1.07204938, + "learning_rate": 0.0009999685533397073, + "loss": 1.08396256, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.27514648, + "step": 174, + "time_per_iteration": 2.560985565185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174784, + "balance_loss_mlp": 1.14617324, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.09969766363227954, + "language_loss": 0.99471402, + "learning_rate": 0.00099996496225069, + "loss": 1.00646186, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.28637695, + "step": 175, + "time_per_iteration": 2.685511589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191936, + "balance_loss_mlp": 1.16053653, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.07081110533815538, + "language_loss": 1.01830065, + "learning_rate": 0.0009999611770580604, + "loss": 1.03022003, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.31396484, + "step": 176, + "time_per_iteration": 2.848646879196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184826, + "balance_loss_mlp": 1.1498735, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.08630072774372038, + "language_loss": 1.00571251, + "learning_rate": 0.0009999571977632876, + "loss": 1.01756072, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.34960938, + "step": 177, + "time_per_iteration": 2.5936646461486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183799, + "balance_loss_mlp": 1.14896631, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.14796843181278477, + "language_loss": 1.03395152, + 
"learning_rate": 0.0009999530243679166, + "loss": 1.04578948, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.34863281, + "step": 178, + "time_per_iteration": 2.578585386276245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119148, + "balance_loss_mlp": 1.15502596, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.07456630143082679, + "language_loss": 0.98466933, + "learning_rate": 0.0009999486568735675, + "loss": 0.99658418, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.36450195, + "step": 179, + "time_per_iteration": 3.0958807468414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204458, + "balance_loss_mlp": 1.16657281, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.1071704794457763, + "language_loss": 0.98888862, + "learning_rate": 0.0009999440952819362, + "loss": 1.00093329, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.37841797, + "step": 180, + "time_per_iteration": 3.7027652263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119721, + "balance_loss_mlp": 1.1615665, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.09808227157719927, + "language_loss": 0.98941529, + "learning_rate": 0.0009999393395947935, + "loss": 1.00138736, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.35644531, + "step": 181, + "time_per_iteration": 2.9549217224121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178907, + "balance_loss_mlp": 1.1453855, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.07390299993950959, + "language_loss": 1.01616848, + "learning_rate": 0.0009999343898139858, + "loss": 1.02795744, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.33520508, + "step": 182, + "time_per_iteration": 2.6392982006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183604, + "balance_loss_mlp": 
1.15117884, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.07686941510438546, + "language_loss": 1.00897217, + "learning_rate": 0.0009999292459414348, + "loss": 1.02080822, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.32397461, + "step": 183, + "time_per_iteration": 2.657658338546753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158427, + "balance_loss_mlp": 1.12702751, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.08111160194171327, + "language_loss": 1.04917085, + "learning_rate": 0.0009999239079791374, + "loss": 1.06075525, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.3137207, + "step": 184, + "time_per_iteration": 2.631359815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115468, + "balance_loss_mlp": 1.12359011, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.06813164935019152, + "language_loss": 0.98483247, + "learning_rate": 0.0009999183759291659, + "loss": 0.99637926, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.31054688, + "step": 185, + "time_per_iteration": 2.7329554557800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133475, + "balance_loss_mlp": 1.10393476, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.1084122935019402, + "language_loss": 1.00212467, + "learning_rate": 0.0009999126497936682, + "loss": 1.01345944, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.29516602, + "step": 186, + "time_per_iteration": 2.5334415435791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110251, + "balance_loss_mlp": 1.08080626, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.057007065611444814, + "language_loss": 1.03274298, + "learning_rate": 0.0009999067295748676, + "loss": 1.04384542, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.29443359, + "step": 187, + 
"time_per_iteration": 2.8514976501464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120219, + "balance_loss_mlp": 1.09280062, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.1063888726335035, + "language_loss": 1.00729585, + "learning_rate": 0.000999900615275062, + "loss": 1.01849806, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.27441406, + "step": 188, + "time_per_iteration": 2.7070038318634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115411, + "balance_loss_mlp": 1.08773041, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.10114859104821755, + "language_loss": 1.06676006, + "learning_rate": 0.0009998943068966256, + "loss": 1.07791412, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.27709961, + "step": 189, + "time_per_iteration": 2.459259271621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128247, + "balance_loss_mlp": 1.0989449, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.09267057508255847, + "language_loss": 1.01174653, + "learning_rate": 0.0009998878044420072, + "loss": 1.02302897, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.29296875, + "step": 190, + "time_per_iteration": 2.710602045059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128613, + "balance_loss_mlp": 1.09881067, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.06756260422642338, + "language_loss": 0.9758327, + "learning_rate": 0.0009998811079137318, + "loss": 0.98711884, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.2980957, + "step": 191, + "time_per_iteration": 2.657074451446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144449, + "balance_loss_mlp": 1.11092758, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.08238379115749897, + "language_loss": 0.98845601, + "learning_rate": 
0.0009998742173143987, + "loss": 0.99990052, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.33544922, + "step": 192, + "time_per_iteration": 2.637148857116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155548, + "balance_loss_mlp": 1.12164509, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.08708605999438765, + "language_loss": 0.98628879, + "learning_rate": 0.0009998671326466833, + "loss": 0.99784422, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.33911133, + "step": 193, + "time_per_iteration": 3.0115370750427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152209, + "balance_loss_mlp": 1.11556399, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.10177169507488108, + "language_loss": 0.98986697, + "learning_rate": 0.0009998598539133362, + "loss": 1.00138903, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.36645508, + "step": 194, + "time_per_iteration": 3.144454002380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161775, + "balance_loss_mlp": 1.12694228, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.0772667916065631, + "language_loss": 1.00733018, + "learning_rate": 0.0009998523811171828, + "loss": 1.01894796, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.34863281, + "step": 195, + "time_per_iteration": 2.577711820602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158404, + "balance_loss_mlp": 1.12314177, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.12690614805983907, + "language_loss": 1.00355625, + "learning_rate": 0.0009998447142611248, + "loss": 1.0151403, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.3527832, + "step": 196, + "time_per_iteration": 2.690129041671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157374, + "balance_loss_mlp": 1.12332833, + 
"epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.06577024943575122, + "language_loss": 0.94151276, + "learning_rate": 0.0009998368533481387, + "loss": 0.9530865, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.34057617, + "step": 197, + "time_per_iteration": 3.045903444290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135441, + "balance_loss_mlp": 1.10120416, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.07117988238902957, + "language_loss": 0.9709003, + "learning_rate": 0.0009998287983812762, + "loss": 0.98225474, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.34277344, + "step": 198, + "time_per_iteration": 2.8663957118988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153893, + "balance_loss_mlp": 1.11910725, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.08607783918573575, + "language_loss": 1.02875066, + "learning_rate": 0.0009998205493636646, + "loss": 1.04028964, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.34790039, + "step": 199, + "time_per_iteration": 2.6874265670776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113429, + "balance_loss_mlp": 1.10010099, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.06925160872633124, + "language_loss": 0.95776969, + "learning_rate": 0.0009998121062985063, + "loss": 0.96911263, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.34179688, + "step": 200, + "time_per_iteration": 2.7165024280548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137597, + "balance_loss_mlp": 1.10424268, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.05789149863906192, + "language_loss": 0.98006374, + "learning_rate": 0.0009998034691890794, + "loss": 0.9914397, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.33349609, + "step": 201, + 
"time_per_iteration": 2.8032913208007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122408, + "balance_loss_mlp": 1.09148479, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.07027358299059557, + "language_loss": 1.02082264, + "learning_rate": 0.0009997946380387369, + "loss": 1.03204679, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.30932617, + "step": 202, + "time_per_iteration": 2.6880364418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113731, + "balance_loss_mlp": 1.08206952, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.06026816631059916, + "language_loss": 1.0439496, + "learning_rate": 0.0009997856128509076, + "loss": 1.05508685, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.31665039, + "step": 203, + "time_per_iteration": 2.8704147338867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120628, + "balance_loss_mlp": 1.089324, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.09379084264769941, + "language_loss": 0.99581945, + "learning_rate": 0.0009997763936290952, + "loss": 1.00702572, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.31298828, + "step": 204, + "time_per_iteration": 2.527740478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131571, + "balance_loss_mlp": 1.09924173, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.09654929574768753, + "language_loss": 1.03863358, + "learning_rate": 0.0009997669803768789, + "loss": 1.04994941, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.32324219, + "step": 205, + "time_per_iteration": 2.7987287044525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114569, + "balance_loss_mlp": 1.08383656, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.07990731679878747, + "language_loss": 0.99632657, + 
"learning_rate": 0.0009997573730979134, + "loss": 1.00747228, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.30712891, + "step": 206, + "time_per_iteration": 2.73876953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03108122, + "balance_loss_mlp": 2.9547708, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.40060181244225235, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.82301319, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 1.53125, + "step": 207, + "time_per_iteration": 4.722966432571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165141, + "balance_loss_mlp": 1.13169098, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.10630094281595456, + "language_loss": 0.98190439, + "learning_rate": 0.0009997375764747294, + "loss": 0.99355578, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.33447266, + "step": 208, + "time_per_iteration": 3.063753128051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176428, + "balance_loss_mlp": 1.14395499, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.08070097632442315, + "language_loss": 0.96488065, + "learning_rate": 0.0009997273871381967, + "loss": 0.97664487, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.32470703, + "step": 209, + "time_per_iteration": 2.738070249557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199702, + "balance_loss_mlp": 1.16675293, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.07976675940517855, + "language_loss": 1.01156783, + "learning_rate": 0.0009997170037902862, + "loss": 1.02356482, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.32958984, + "step": 210, + "time_per_iteration": 2.7301113605499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207423, + "balance_loss_mlp": 
1.17323339, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.10146454126791286, + "language_loss": 1.03771198, + "learning_rate": 0.0009997064264350292, + "loss": 1.04978609, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.34228516, + "step": 211, + "time_per_iteration": 2.8760437965393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199912, + "balance_loss_mlp": 1.16364872, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.07215586978638981, + "language_loss": 0.9769634, + "learning_rate": 0.0009996956550765317, + "loss": 0.98896253, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.36254883, + "step": 212, + "time_per_iteration": 2.704005479812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209491, + "balance_loss_mlp": 1.17389536, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07781252726849613, + "language_loss": 0.9221555, + "learning_rate": 0.0009996846897189762, + "loss": 0.93425035, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.35595703, + "step": 213, + "time_per_iteration": 2.6465373039245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209061, + "balance_loss_mlp": 1.17510998, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.09713937665314668, + "language_loss": 0.99115217, + "learning_rate": 0.0009996735303666193, + "loss": 1.00324273, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.33984375, + "step": 214, + "time_per_iteration": 2.7262256145477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217734, + "balance_loss_mlp": 1.18261504, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.0828033847449013, + "language_loss": 1.01114583, + "learning_rate": 0.0009996621770237937, + "loss": 1.02332306, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.35131836, + "step": 215, + 
"time_per_iteration": 2.774261951446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228156, + "balance_loss_mlp": 1.19122505, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.09368483866206018, + "language_loss": 0.9696883, + "learning_rate": 0.0009996506296949073, + "loss": 0.98196977, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.36889648, + "step": 216, + "time_per_iteration": 2.9090492725372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227468, + "balance_loss_mlp": 1.18944013, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.07734539448931728, + "language_loss": 0.9667756, + "learning_rate": 0.0009996388883844428, + "loss": 0.97905028, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.38037109, + "step": 217, + "time_per_iteration": 2.6576592922210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219854, + "balance_loss_mlp": 1.18299437, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.05044055802439308, + "language_loss": 1.01232481, + "learning_rate": 0.0009996269530969588, + "loss": 1.02452338, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.36865234, + "step": 218, + "time_per_iteration": 2.5997114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212901, + "balance_loss_mlp": 1.17787719, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.09536649242864963, + "language_loss": 0.99537694, + "learning_rate": 0.0009996148238370888, + "loss": 1.00750601, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.3503418, + "step": 219, + "time_per_iteration": 2.794192314147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210779, + "balance_loss_mlp": 1.17465854, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.05253448416537987, + "language_loss": 0.95164675, + 
"learning_rate": 0.0009996025006095421, + "loss": 0.96375448, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.36132812, + "step": 220, + "time_per_iteration": 3.387816905975342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03435379, + "balance_loss_mlp": 3.13935852, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.2631843872282414, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81218523, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 2.953125, + "step": 221, + "time_per_iteration": 5.1907196044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198698, + "balance_loss_mlp": 1.16422272, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.084470461812628, + "language_loss": 0.96455717, + "learning_rate": 0.0009995772722706307, + "loss": 0.97654414, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.3449707, + "step": 222, + "time_per_iteration": 2.817683219909668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196045, + "balance_loss_mlp": 1.16013885, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.09039049489351958, + "language_loss": 1.09978271, + "learning_rate": 0.0009995643671690604, + "loss": 1.11174321, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.35888672, + "step": 223, + "time_per_iteration": 2.473952293395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187658, + "balance_loss_mlp": 1.15511417, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.098631665550633, + "language_loss": 0.9726367, + "learning_rate": 0.0009995512681194023, + "loss": 0.98451328, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.32543945, + "step": 224, + "time_per_iteration": 2.8320233821868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173941, + "balance_loss_mlp": 
1.14256525, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.09492392354161142, + "language_loss": 0.95751745, + "learning_rate": 0.0009995379751267417, + "loss": 0.96925682, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.31347656, + "step": 225, + "time_per_iteration": 3.265004873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151277, + "balance_loss_mlp": 1.11923385, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.07692691631212083, + "language_loss": 0.96905231, + "learning_rate": 0.0009995244881962398, + "loss": 0.98056507, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.3203125, + "step": 226, + "time_per_iteration": 2.6380093097686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122529, + "balance_loss_mlp": 1.09217834, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.1280080940779162, + "language_loss": 0.97453952, + "learning_rate": 0.0009995108073331323, + "loss": 0.98576486, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.30322266, + "step": 227, + "time_per_iteration": 2.611384630203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116442, + "balance_loss_mlp": 1.08482742, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.05834750559212819, + "language_loss": 1.00860834, + "learning_rate": 0.0009994969325427309, + "loss": 1.01977265, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.31640625, + "step": 228, + "time_per_iteration": 2.690300941467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123233, + "balance_loss_mlp": 1.08851922, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.06096273128167382, + "language_loss": 0.96635395, + "learning_rate": 0.0009994828638304218, + "loss": 0.97758633, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.34716797, + "step": 229, + 
"time_per_iteration": 2.666841506958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128717, + "balance_loss_mlp": 1.093979, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.08326095283531681, + "language_loss": 1.02012706, + "learning_rate": 0.0009994686012016675, + "loss": 1.03141427, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.34765625, + "step": 230, + "time_per_iteration": 2.5846869945526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153704, + "balance_loss_mlp": 1.12056351, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.09069798767816209, + "language_loss": 1.01831698, + "learning_rate": 0.000999454144662005, + "loss": 1.02985406, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.33154297, + "step": 231, + "time_per_iteration": 2.923693895339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115334, + "balance_loss_mlp": 1.11736226, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.09055690768180072, + "language_loss": 0.95778871, + "learning_rate": 0.0009994394942170468, + "loss": 0.9693222, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.35961914, + "step": 232, + "time_per_iteration": 2.7030160427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142015, + "balance_loss_mlp": 1.10806465, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.058800323500690845, + "language_loss": 0.93958372, + "learning_rate": 0.0009994246498724808, + "loss": 0.95100385, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.33984375, + "step": 233, + "time_per_iteration": 2.7212979793548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138192, + "balance_loss_mlp": 1.10519481, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.06773344256027236, + "language_loss": 0.96352422, + 
"learning_rate": 0.00099940961163407, + "loss": 0.97490609, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.33007812, + "step": 234, + "time_per_iteration": 2.901205062866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136264, + "balance_loss_mlp": 1.10300505, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.061338570366332154, + "language_loss": 0.98733097, + "learning_rate": 0.0009993943795076528, + "loss": 0.99869365, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.33251953, + "step": 235, + "time_per_iteration": 2.6201589107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132034, + "balance_loss_mlp": 1.09834564, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.07983858027410345, + "language_loss": 1.00555849, + "learning_rate": 0.0009993789534991427, + "loss": 1.01687884, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.33691406, + "step": 236, + "time_per_iteration": 2.454946279525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132728, + "balance_loss_mlp": 1.0996356, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.08392760705769248, + "language_loss": 0.95816457, + "learning_rate": 0.0009993633336145287, + "loss": 0.96949184, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.33056641, + "step": 237, + "time_per_iteration": 2.6566781997680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128359, + "balance_loss_mlp": 1.09655356, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.08042180371297789, + "language_loss": 1.00147879, + "learning_rate": 0.0009993475198598752, + "loss": 1.01276243, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.31811523, + "step": 238, + "time_per_iteration": 3.0513856410980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126034, + "balance_loss_mlp": 
1.09301257, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.0829568534139584, + "language_loss": 0.95935237, + "learning_rate": 0.0009993315122413212, + "loss": 0.97061276, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.33007812, + "step": 239, + "time_per_iteration": 2.659076690673828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138389, + "balance_loss_mlp": 1.10458112, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.07781318144537454, + "language_loss": 0.96732402, + "learning_rate": 0.0009993153107650818, + "loss": 0.97870797, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.33813477, + "step": 240, + "time_per_iteration": 2.6491312980651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141869, + "balance_loss_mlp": 1.10593915, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.09031233919320754, + "language_loss": 0.95913565, + "learning_rate": 0.0009992989154374468, + "loss": 0.97055435, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.35961914, + "step": 241, + "time_per_iteration": 2.5679047107696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153823, + "balance_loss_mlp": 1.11829901, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.07248816816072506, + "language_loss": 1.03108311, + "learning_rate": 0.0009992823262647817, + "loss": 1.04262137, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.35546875, + "step": 242, + "time_per_iteration": 2.7263669967651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146146, + "balance_loss_mlp": 1.11167073, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.08958834607355992, + "language_loss": 0.96952182, + "learning_rate": 0.0009992655432535264, + "loss": 0.98098326, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.3449707, + "step": 243, + 
"time_per_iteration": 2.7712135314941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156392, + "balance_loss_mlp": 1.12115347, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.06980487860605987, + "language_loss": 0.97863543, + "learning_rate": 0.0009992485664101973, + "loss": 0.99019933, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.35229492, + "step": 244, + "time_per_iteration": 2.7024378776550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164354, + "balance_loss_mlp": 1.12825704, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.059856394455686884, + "language_loss": 0.9987036, + "learning_rate": 0.000999231395741385, + "loss": 1.01034713, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.36108398, + "step": 245, + "time_per_iteration": 3.1183571815490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165942, + "balance_loss_mlp": 1.13215792, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.0943556711706318, + "language_loss": 0.97312224, + "learning_rate": 0.0009992140312537557, + "loss": 0.98478168, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.33789062, + "step": 246, + "time_per_iteration": 2.6516497135162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144759, + "balance_loss_mlp": 1.11121345, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.07660143361567079, + "language_loss": 0.93426013, + "learning_rate": 0.000999196472954051, + "loss": 0.94570768, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.33569336, + "step": 247, + "time_per_iteration": 2.975703477859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.06084414, + "balance_loss_mlp": 5.7578764, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.5887991941215185, + "language_loss": 0.79424852, + "learning_rate": 
0.0009991787208490878, + "loss": 0.85509264, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 3.265625, + "step": 248, + "time_per_iteration": 5.566707372665405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147412, + "balance_loss_mlp": 1.11617875, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.08054908277290292, + "language_loss": 0.99819887, + "learning_rate": 0.0009991607749457578, + "loss": 1.00967312, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.31225586, + "step": 249, + "time_per_iteration": 2.601257801055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179313, + "balance_loss_mlp": 1.14769912, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.0802426637388702, + "language_loss": 0.97979879, + "learning_rate": 0.0009991426352510286, + "loss": 0.99159187, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.31591797, + "step": 250, + "time_per_iteration": 3.036884069442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221052, + "balance_loss_mlp": 1.18660045, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.10047763480801107, + "language_loss": 0.99211901, + "learning_rate": 0.0009991243017719422, + "loss": 1.00432956, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.34448242, + "step": 251, + "time_per_iteration": 2.6728298664093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221015, + "balance_loss_mlp": 1.18696856, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.09158100422304945, + "language_loss": 0.93989825, + "learning_rate": 0.0009991057745156165, + "loss": 0.95210844, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.34033203, + "step": 252, + "time_per_iteration": 2.6554462909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03874573, + "balance_loss_mlp": 3.65637207, + 
"epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.4297237142905687, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.85785556, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 2.1875, + "step": 253, + "time_per_iteration": 5.027901649475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243575, + "balance_loss_mlp": 1.20886147, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.13167813172379958, + "language_loss": 1.02751815, + "learning_rate": 0.0009990681387000943, + "loss": 1.03995395, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.34716797, + "step": 254, + "time_per_iteration": 2.830775260925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287984, + "balance_loss_mlp": 1.25019443, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.07749867859212424, + "language_loss": 0.9817788, + "learning_rate": 0.0009990490301555093, + "loss": 0.99465859, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.37792969, + "step": 255, + "time_per_iteration": 2.9786195755004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04361559, + "balance_loss_mlp": 4.02739191, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.4777758897592442, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.83576715, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 3.34375, + "step": 256, + "time_per_iteration": 4.883893013000488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03108787, + "balance_loss_mlp": 2.91576314, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.20418401203526695, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.8235153, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.9296875, + "step": 257, + "time_per_iteration": 
4.981754541397095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03419801, + "balance_loss_mlp": 3.31070042, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.4614192090098086, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.73395681, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.09375, + "step": 258, + "time_per_iteration": 4.904312372207642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0147617, + "balance_loss_mlp": 1.43730807, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.19757670960702672, + "language_loss": 0.92998719, + "learning_rate": 0.0009989706585723202, + "loss": 0.94474888, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.38867188, + "step": 259, + "time_per_iteration": 2.8021159172058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01539233, + "balance_loss_mlp": 1.49808145, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.19510301282339976, + "language_loss": 0.99383926, + "learning_rate": 0.0009989505813633442, + "loss": 1.00923157, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.41137695, + "step": 260, + "time_per_iteration": 2.6653668880462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01478791, + "balance_loss_mlp": 1.4348743, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.10786879930825251, + "language_loss": 0.98759341, + "learning_rate": 0.000998930310444573, + "loss": 1.00238132, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.43920898, + "step": 261, + "time_per_iteration": 2.7604081630706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432802, + "balance_loss_mlp": 1.38426006, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.09058377349206405, + "language_loss": 0.96455801, + "learning_rate": 0.0009989098458238765, 
+ "loss": 0.97888601, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.4855957, + "step": 262, + "time_per_iteration": 2.8061673641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01428574, + "balance_loss_mlp": 1.3737855, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.09431506628041801, + "language_loss": 0.959288, + "learning_rate": 0.0009988891875091998, + "loss": 0.9735738, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.54833984, + "step": 263, + "time_per_iteration": 2.756467819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0142654, + "balance_loss_mlp": 1.36974835, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.10391907645657336, + "language_loss": 0.90729272, + "learning_rate": 0.0009988683355085636, + "loss": 0.92155808, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.56787109, + "step": 264, + "time_per_iteration": 2.7685976028442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420401, + "balance_loss_mlp": 1.3644681, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.09802606586789958, + "language_loss": 0.99670649, + "learning_rate": 0.000998847289830063, + "loss": 1.01091051, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.55957031, + "step": 265, + "time_per_iteration": 2.831874132156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01390772, + "balance_loss_mlp": 1.34082305, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.13175698376961376, + "language_loss": 0.92018604, + "learning_rate": 0.0009988260504818682, + "loss": 0.93409377, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.49926758, + "step": 266, + "time_per_iteration": 2.5666043758392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01364075, + "balance_loss_mlp": 1.31720233, + "epoch": 0.0513659099653713, + 
"flos": 505032910848.0, + "grad_norm": 0.11617121831129276, + "language_loss": 0.98586178, + "learning_rate": 0.000998804617472226, + "loss": 0.99950248, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.46899414, + "step": 267, + "time_per_iteration": 2.683875322341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339401, + "balance_loss_mlp": 1.29844046, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.13482471872787388, + "language_loss": 0.93566334, + "learning_rate": 0.0009987829908094568, + "loss": 0.94905734, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.40966797, + "step": 268, + "time_per_iteration": 2.844641923904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270506, + "balance_loss_mlp": 1.23007023, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.10753691268900553, + "language_loss": 1.00233316, + "learning_rate": 0.0009987611705019569, + "loss": 1.01503825, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.40454102, + "step": 269, + "time_per_iteration": 4.188141107559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223109, + "balance_loss_mlp": 1.1811955, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.09459011438584931, + "language_loss": 0.9928273, + "learning_rate": 0.0009987391565581978, + "loss": 1.00505841, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.41943359, + "step": 270, + "time_per_iteration": 2.603743076324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187631, + "balance_loss_mlp": 1.14400077, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.06481058483540457, + "language_loss": 0.91893035, + "learning_rate": 0.000998716948986726, + "loss": 0.93080664, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.43652344, + "step": 271, + "time_per_iteration": 2.8780717849731445 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189162, + "balance_loss_mlp": 1.14545989, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.0816946734367831, + "language_loss": 0.93787229, + "learning_rate": 0.0009986945477961633, + "loss": 0.94976389, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.4375, + "step": 272, + "time_per_iteration": 2.723017692565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181133, + "balance_loss_mlp": 1.13828969, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.0734282707774283, + "language_loss": 0.99389303, + "learning_rate": 0.0009986719529952066, + "loss": 1.00570428, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.4284668, + "step": 273, + "time_per_iteration": 2.8852784633636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175268, + "balance_loss_mlp": 1.13082659, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.10629611668364672, + "language_loss": 0.98564589, + "learning_rate": 0.000998649164592628, + "loss": 0.99739856, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.44458008, + "step": 274, + "time_per_iteration": 2.616504430770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151835, + "balance_loss_mlp": 1.1077987, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.10641977070505904, + "language_loss": 0.95747149, + "learning_rate": 0.0009986261825972748, + "loss": 0.96898991, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.44018555, + "step": 275, + "time_per_iteration": 2.7185463905334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170515, + "balance_loss_mlp": 1.12447667, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.09271858345864015, + "language_loss": 0.98292786, + "learning_rate": 0.000998603007018069, + "loss": 0.99463308, + 
"num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.46044922, + "step": 276, + "time_per_iteration": 2.884373188018799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120113, + "balance_loss_mlp": 1.15065718, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.06824174267425122, + "language_loss": 0.95424223, + "learning_rate": 0.0009985796378640089, + "loss": 0.96625352, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.50512695, + "step": 277, + "time_per_iteration": 2.766671895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196508, + "balance_loss_mlp": 1.14670205, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.07462742938020851, + "language_loss": 0.95504081, + "learning_rate": 0.0009985560751441665, + "loss": 0.96700585, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.49829102, + "step": 278, + "time_per_iteration": 2.8290188312530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202501, + "balance_loss_mlp": 1.1519084, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.08249787624351518, + "language_loss": 0.97367889, + "learning_rate": 0.00099853231886769, + "loss": 0.98570395, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.5065918, + "step": 279, + "time_per_iteration": 2.7985732555389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208136, + "balance_loss_mlp": 1.15880692, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06817333546872655, + "language_loss": 0.98251152, + "learning_rate": 0.0009985083690438024, + "loss": 0.99459285, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.4934082, + "step": 280, + "time_per_iteration": 2.711107015609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120231, + "balance_loss_mlp": 1.15419662, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, 
+ "grad_norm": 0.06285675396315912, + "language_loss": 0.88899338, + "learning_rate": 0.0009984842256818016, + "loss": 0.90101647, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.48095703, + "step": 281, + "time_per_iteration": 3.089395761489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118336, + "balance_loss_mlp": 1.13934779, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.09184892817545263, + "language_loss": 0.99464393, + "learning_rate": 0.0009984598887910613, + "loss": 1.00647748, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.43994141, + "step": 282, + "time_per_iteration": 2.809372663497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193758, + "balance_loss_mlp": 1.14736223, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.0862697219544723, + "language_loss": 0.95099992, + "learning_rate": 0.0009984353583810297, + "loss": 0.96293747, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.46386719, + "step": 283, + "time_per_iteration": 2.887547016143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174794, + "balance_loss_mlp": 1.12997127, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.07077343192171563, + "language_loss": 0.96608889, + "learning_rate": 0.0009984106344612302, + "loss": 0.97783673, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.44799805, + "step": 284, + "time_per_iteration": 2.7930290699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158485, + "balance_loss_mlp": 1.11640382, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.07340568947827376, + "language_loss": 0.92955279, + "learning_rate": 0.0009983857170412615, + "loss": 0.94113761, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.42089844, + "step": 285, + "time_per_iteration": 3.0093743801116943 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165069, + "balance_loss_mlp": 1.1219871, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.05960836075086468, + "language_loss": 0.92676461, + "learning_rate": 0.000998360606130798, + "loss": 0.93841541, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.43041992, + "step": 286, + "time_per_iteration": 2.837170362472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03710432, + "balance_loss_mlp": 3.53495646, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.1985650778679295, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.72783548, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.7578125, + "step": 287, + "time_per_iteration": 4.929426908493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157732, + "balance_loss_mlp": 1.11290884, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.08938470510968509, + "language_loss": 0.98063755, + "learning_rate": 0.0009983098038774552, + "loss": 0.99221486, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.44799805, + "step": 288, + "time_per_iteration": 2.8677265644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03445158, + "balance_loss_mlp": 3.31088066, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.2206810579053755, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.81615388, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.34375, + "step": 289, + "time_per_iteration": 4.795354604721069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204321, + "balance_loss_mlp": 1.15992737, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.08343344919246831, + "language_loss": 0.96212429, + "learning_rate": 0.0009982582277800948, + "loss": 0.97416747, + 
"num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.44360352, + "step": 290, + "time_per_iteration": 2.610515832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201009, + "balance_loss_mlp": 1.15659118, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.09373610552028779, + "language_loss": 1.02980018, + "learning_rate": 0.0009982321495648908, + "loss": 1.04181027, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.4440918, + "step": 291, + "time_per_iteration": 2.847222089767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213433, + "balance_loss_mlp": 1.16884899, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.12267020035660053, + "language_loss": 0.94884562, + "learning_rate": 0.0009982058779188115, + "loss": 0.96097994, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.44604492, + "step": 292, + "time_per_iteration": 2.7585439682006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190958, + "balance_loss_mlp": 1.14596868, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.07287048907504978, + "language_loss": 1.01494539, + "learning_rate": 0.0009981794128520567, + "loss": 1.02685499, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.44970703, + "step": 293, + "time_per_iteration": 2.8542449474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194856, + "balance_loss_mlp": 1.14817381, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.060100997943795566, + "language_loss": 0.98246396, + "learning_rate": 0.000998152754374901, + "loss": 0.99441248, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.46704102, + "step": 294, + "time_per_iteration": 2.897792100906372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183893, + "balance_loss_mlp": 1.13856936, + "epoch": 0.05675259715275106, + "flos": 
617242830336.0, + "grad_norm": 0.0698691020933478, + "language_loss": 0.94496101, + "learning_rate": 0.0009981259024976943, + "loss": 0.95679998, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.453125, + "step": 295, + "time_per_iteration": 2.7404842376708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186618, + "balance_loss_mlp": 1.14067447, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.10167990029855892, + "language_loss": 0.92340136, + "learning_rate": 0.0009980988572308612, + "loss": 0.93526757, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.45922852, + "step": 296, + "time_per_iteration": 3.007516384124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169257, + "balance_loss_mlp": 1.12450624, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.07320939901236567, + "language_loss": 0.95507723, + "learning_rate": 0.0009980716185849015, + "loss": 0.96676981, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.44775391, + "step": 297, + "time_per_iteration": 2.9953107833862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163884, + "balance_loss_mlp": 1.12180316, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06695295039959538, + "language_loss": 0.92045325, + "learning_rate": 0.0009980441865703904, + "loss": 0.93209207, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.4206543, + "step": 298, + "time_per_iteration": 2.6119296550750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149691, + "balance_loss_mlp": 1.10896909, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.07389257813376128, + "language_loss": 1.00092888, + "learning_rate": 0.000998016561197978, + "loss": 1.0124259, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.40698242, + "step": 299, + "time_per_iteration": 2.776057004928589 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139916, + "balance_loss_mlp": 1.10072017, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.08850581007108178, + "language_loss": 0.91981971, + "learning_rate": 0.0009979887424783895, + "loss": 0.93121886, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.39208984, + "step": 300, + "time_per_iteration": 2.9253783226013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114416, + "balance_loss_mlp": 1.10362935, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.06286570971506464, + "language_loss": 0.91965425, + "learning_rate": 0.0009979607304224248, + "loss": 0.93109584, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.40527344, + "step": 301, + "time_per_iteration": 2.7880210876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148274, + "balance_loss_mlp": 1.10626435, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.07282163575611278, + "language_loss": 0.98193479, + "learning_rate": 0.000997932525040959, + "loss": 0.9934175, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.41992188, + "step": 302, + "time_per_iteration": 2.6913211345672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135024, + "balance_loss_mlp": 1.09647226, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.08010118219410382, + "language_loss": 1.00433981, + "learning_rate": 0.000997904126344943, + "loss": 1.01569009, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.38549805, + "step": 303, + "time_per_iteration": 2.648486375808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152351, + "balance_loss_mlp": 1.112535, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.07274054196633538, + "language_loss": 0.95938694, + "learning_rate": 0.0009978755343454018, + "loss": 0.97091049, + 
"num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.39794922, + "step": 304, + "time_per_iteration": 2.7488231658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162494, + "balance_loss_mlp": 1.12279713, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.07785655900909055, + "language_loss": 0.97099572, + "learning_rate": 0.0009978467490534355, + "loss": 0.98262066, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.39697266, + "step": 305, + "time_per_iteration": 2.5928122997283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161095, + "balance_loss_mlp": 1.12101698, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.06710807116161162, + "language_loss": 0.94506705, + "learning_rate": 0.00099781777048022, + "loss": 0.95667803, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.40087891, + "step": 306, + "time_per_iteration": 2.7071874141693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166562, + "balance_loss_mlp": 1.12727094, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.06805578843696672, + "language_loss": 0.95336848, + "learning_rate": 0.0009977885986370057, + "loss": 0.96503407, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.39282227, + "step": 307, + "time_per_iteration": 2.560727119445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181694, + "balance_loss_mlp": 1.14190209, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.07408509854998435, + "language_loss": 0.92084455, + "learning_rate": 0.000997759233535118, + "loss": 0.93266147, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.39770508, + "step": 308, + "time_per_iteration": 2.811706304550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199188, + "balance_loss_mlp": 1.15813279, + "epoch": 0.05944594074644094, + "flos": 
563655532032.0, + "grad_norm": 0.11332767927985109, + "language_loss": 0.97065681, + "learning_rate": 0.0009977296751859576, + "loss": 0.98264867, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.41040039, + "step": 309, + "time_per_iteration": 2.8100500106811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182124, + "balance_loss_mlp": 1.14152098, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.06886541031259097, + "language_loss": 0.99580777, + "learning_rate": 0.0009976999236009998, + "loss": 1.00762904, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.40576172, + "step": 310, + "time_per_iteration": 2.7856838703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116778, + "balance_loss_mlp": 1.12984788, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.07671642451322926, + "language_loss": 1.00938904, + "learning_rate": 0.0009976699787917955, + "loss": 1.02106678, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.37939453, + "step": 311, + "time_per_iteration": 2.679760217666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01912771, + "balance_loss_mlp": 1.87653184, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.11004817833063929, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75355768, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.36328125, + "step": 312, + "time_per_iteration": 5.035902976989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167025, + "balance_loss_mlp": 1.12742412, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.08745367830689305, + "language_loss": 0.92707014, + "learning_rate": 0.0009976095095472243, + "loss": 0.93874037, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.39575195, + "step": 313, + "time_per_iteration": 2.606323480606079 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137298, + "balance_loss_mlp": 1.10091519, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.07680079441574393, + "language_loss": 0.94012022, + "learning_rate": 0.0009975789851353334, + "loss": 0.95149314, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.36352539, + "step": 314, + "time_per_iteration": 2.838961362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135404, + "balance_loss_mlp": 1.10076201, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.07916345547758051, + "language_loss": 0.96821368, + "learning_rate": 0.0009975482675461487, + "loss": 0.97956777, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.34643555, + "step": 315, + "time_per_iteration": 2.6935253143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122263, + "balance_loss_mlp": 1.08905149, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.06025100036286014, + "language_loss": 0.94348001, + "learning_rate": 0.0009975173567915952, + "loss": 0.95470262, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.33203125, + "step": 316, + "time_per_iteration": 2.784148931503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123821, + "balance_loss_mlp": 1.08903599, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.06288070363718151, + "language_loss": 0.8781901, + "learning_rate": 0.000997486252883674, + "loss": 0.88942832, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.34765625, + "step": 317, + "time_per_iteration": 2.8335070610046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130287, + "balance_loss_mlp": 1.09628844, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.08951651385504938, + "language_loss": 0.93891156, + "learning_rate": 0.0009974549558344602, + "loss": 0.95021445, + 
"num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.33984375, + "step": 318, + "time_per_iteration": 3.661447048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140147, + "balance_loss_mlp": 1.10564828, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.0956877361367619, + "language_loss": 1.0199635, + "learning_rate": 0.000997423465656105, + "loss": 1.03136492, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.34521484, + "step": 319, + "time_per_iteration": 2.7822437286376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124104, + "balance_loss_mlp": 1.08896148, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.10289983756800847, + "language_loss": 0.99710345, + "learning_rate": 0.0009973917823608335, + "loss": 1.00834441, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.3515625, + "step": 320, + "time_per_iteration": 2.631345272064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135481, + "balance_loss_mlp": 1.09964669, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.0680580088694669, + "language_loss": 0.95663267, + "learning_rate": 0.0009973599059609462, + "loss": 0.96798748, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.35839844, + "step": 321, + "time_per_iteration": 2.7266485691070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117054, + "balance_loss_mlp": 1.13201189, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.07460436538347456, + "language_loss": 0.9288404, + "learning_rate": 0.000997327836468819, + "loss": 0.9405458, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.38525391, + "step": 322, + "time_per_iteration": 2.673107385635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179324, + "balance_loss_mlp": 1.14246416, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + 
"grad_norm": 0.08768405045584388, + "language_loss": 0.95868701, + "learning_rate": 0.000997295573896902, + "loss": 0.9704802, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.36865234, + "step": 323, + "time_per_iteration": 2.89715838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974818, + "balance_loss_mlp": 1.93609941, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.15129070182137194, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83170861, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.38671875, + "step": 324, + "time_per_iteration": 4.777086496353149 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01725792, + "balance_loss_mlp": 1.68592823, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.07336881302622408, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80297732, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.3984375, + "step": 325, + "time_per_iteration": 4.8852620124816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203893, + "balance_loss_mlp": 1.16684282, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.09305137701088195, + "language_loss": 0.90879977, + "learning_rate": 0.000997197627828043, + "loss": 0.92083865, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.37060547, + "step": 326, + "time_per_iteration": 2.615715980529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198178, + "balance_loss_mlp": 1.16174805, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.10757754770856821, + "language_loss": 0.86059356, + "learning_rate": 0.0009971645930629716, + "loss": 0.8725754, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.36450195, + "step": 327, + "time_per_iteration": 2.7753512859344482 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01193235, + "balance_loss_mlp": 1.15790117, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.0829627430200847, + "language_loss": 0.98908973, + "learning_rate": 0.0009971313652814872, + "loss": 1.00102198, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.35351562, + "step": 328, + "time_per_iteration": 2.8697071075439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183904, + "balance_loss_mlp": 1.14957154, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.07808148320278054, + "language_loss": 0.9654116, + "learning_rate": 0.0009970979444964903, + "loss": 0.97725058, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.34350586, + "step": 329, + "time_per_iteration": 3.013674259185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179752, + "balance_loss_mlp": 1.14446568, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.08385245466398004, + "language_loss": 0.97686106, + "learning_rate": 0.0009970643307209556, + "loss": 0.98865855, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.3527832, + "step": 330, + "time_per_iteration": 2.868323802947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168029, + "balance_loss_mlp": 1.13097858, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.08206463725837071, + "language_loss": 0.93874633, + "learning_rate": 0.0009970305239679334, + "loss": 0.95042664, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.37060547, + "step": 331, + "time_per_iteration": 2.8225202560424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178202, + "balance_loss_mlp": 1.14210534, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.07579712662789459, + "language_loss": 0.98774493, + "learning_rate": 0.0009969965242505483, + "loss": 0.99952692, + "num_input_tokens_seen": 
26867536, + "router_z_loss_mlp": 0.36108398, + "step": 332, + "time_per_iteration": 2.8107545375823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168447, + "balance_loss_mlp": 1.13325644, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.07917811788525977, + "language_loss": 0.94783902, + "learning_rate": 0.0009969623315820007, + "loss": 0.95952344, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.35180664, + "step": 333, + "time_per_iteration": 2.698505401611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171432, + "balance_loss_mlp": 1.13636017, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.0763666551446786, + "language_loss": 0.95210952, + "learning_rate": 0.000996927945975565, + "loss": 0.96382385, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.35083008, + "step": 334, + "time_per_iteration": 2.584472894668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115222, + "balance_loss_mlp": 1.11686206, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.08033900057819754, + "language_loss": 0.91956127, + "learning_rate": 0.0009968933674445906, + "loss": 0.93108344, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.35375977, + "step": 335, + "time_per_iteration": 2.689556837081909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114559, + "balance_loss_mlp": 1.11109114, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.06825993333788044, + "language_loss": 0.94537115, + "learning_rate": 0.0009968585960025028, + "loss": 0.95682704, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.34521484, + "step": 336, + "time_per_iteration": 3.009956121444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02330067, + "balance_loss_mlp": 2.29764199, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 
0.13230953132672904, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79983252, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.32421875, + "step": 337, + "time_per_iteration": 4.800926685333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126942, + "balance_loss_mlp": 1.09404051, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.06377137616855041, + "language_loss": 0.92515147, + "learning_rate": 0.0009967884744390583, + "loss": 0.93642092, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.32910156, + "step": 338, + "time_per_iteration": 3.5464487075805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124603, + "balance_loss_mlp": 1.09043801, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.0855348813631026, + "language_loss": 0.93111128, + "learning_rate": 0.0009967531243449256, + "loss": 0.9423573, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.34155273, + "step": 339, + "time_per_iteration": 2.6777007579803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131136, + "balance_loss_mlp": 1.09642255, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.07604626819248426, + "language_loss": 1.00833654, + "learning_rate": 0.000996717581394126, + "loss": 1.01964784, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.34741211, + "step": 340, + "time_per_iteration": 2.6667256355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145305, + "balance_loss_mlp": 1.10975671, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.07959679456110856, + "language_loss": 1.00992751, + "learning_rate": 0.000996681845600459, + "loss": 1.02138054, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.35571289, + "step": 341, + "time_per_iteration": 2.6750872135162354 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01168019, + "balance_loss_mlp": 1.13158822, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.07803079751348088, + "language_loss": 0.92980075, + "learning_rate": 0.0009966459169777982, + "loss": 0.94148099, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.36425781, + "step": 342, + "time_per_iteration": 2.5240936279296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186775, + "balance_loss_mlp": 1.14920056, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.07114695189108672, + "language_loss": 1.02233219, + "learning_rate": 0.0009966097955400924, + "loss": 1.03419995, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.37597656, + "step": 343, + "time_per_iteration": 2.701003313064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182072, + "balance_loss_mlp": 1.14444947, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.07450598076984326, + "language_loss": 0.95542282, + "learning_rate": 0.0009965734813013652, + "loss": 0.96724355, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.3762207, + "step": 344, + "time_per_iteration": 2.823782444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196951, + "balance_loss_mlp": 1.15773153, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.0604450427343926, + "language_loss": 0.97975069, + "learning_rate": 0.0009965369742757151, + "loss": 0.9917202, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.39208984, + "step": 345, + "time_per_iteration": 2.5793161392211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222733, + "balance_loss_mlp": 1.18341792, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.07564429768448787, + "language_loss": 0.95189452, + "learning_rate": 0.0009965002744773152, + "loss": 0.96412188, + "num_input_tokens_seen": 
28050864, + "router_z_loss_mlp": 0.39306641, + "step": 346, + "time_per_iteration": 3.5293569564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225458, + "balance_loss_mlp": 1.18573725, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.07389332256160373, + "language_loss": 0.91674209, + "learning_rate": 0.0009964633819204139, + "loss": 0.92899668, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.3972168, + "step": 347, + "time_per_iteration": 2.672184705734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01858873, + "balance_loss_mlp": 1.81805611, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.060031539331637095, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83659983, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.40820312, + "step": 348, + "time_per_iteration": 4.947252988815308 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01829705, + "balance_loss_mlp": 1.78698003, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.05093987002559095, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76983589, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.42773438, + "step": 349, + "time_per_iteration": 4.949065923690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200476, + "balance_loss_mlp": 1.16268659, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.10157855165040833, + "language_loss": 0.91835332, + "learning_rate": 0.000996351547842304, + "loss": 0.93035805, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.37792969, + "step": 350, + "time_per_iteration": 3.195343255996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175391, + "balance_loss_mlp": 1.13869905, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 
0.09856595883672854, + "language_loss": 0.90272117, + "learning_rate": 0.0009963138843953744, + "loss": 0.91447508, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.3671875, + "step": 351, + "time_per_iteration": 2.6402506828308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141973, + "balance_loss_mlp": 1.10692537, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.061148145233813844, + "language_loss": 0.94241744, + "learning_rate": 0.000996276028262306, + "loss": 0.95383716, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.3503418, + "step": 352, + "time_per_iteration": 2.834099769592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112107, + "balance_loss_mlp": 1.08011079, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.08429524036953953, + "language_loss": 1.00538242, + "learning_rate": 0.0009962379794577964, + "loss": 1.01650345, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.31982422, + "step": 353, + "time_per_iteration": 2.6607887744903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110445, + "balance_loss_mlp": 1.07780528, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.07871401687807635, + "language_loss": 0.91255635, + "learning_rate": 0.000996199737996617, + "loss": 0.92366076, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.32641602, + "step": 354, + "time_per_iteration": 2.977060317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106148, + "balance_loss_mlp": 1.07484412, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.07891213217714192, + "language_loss": 0.99330544, + "learning_rate": 0.0009961613038936149, + "loss": 1.00436699, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.31274414, + "step": 355, + "time_per_iteration": 2.615016222000122 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01097974, + "balance_loss_mlp": 1.06619298, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.06589791904701883, + "language_loss": 0.92011106, + "learning_rate": 0.000996122677163711, + "loss": 0.93109083, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.31762695, + "step": 356, + "time_per_iteration": 2.844289541244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110581, + "balance_loss_mlp": 1.07848942, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.09636291923958067, + "language_loss": 0.97709715, + "learning_rate": 0.000996083857821902, + "loss": 0.98820293, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.32080078, + "step": 357, + "time_per_iteration": 3.0474655628204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137059, + "balance_loss_mlp": 1.10334635, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.09472058747565097, + "language_loss": 0.95954913, + "learning_rate": 0.0009960448458832588, + "loss": 0.97091973, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.3371582, + "step": 358, + "time_per_iteration": 2.7681682109832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153475, + "balance_loss_mlp": 1.12002492, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.10342324791005938, + "language_loss": 0.95369232, + "learning_rate": 0.000996005641362927, + "loss": 0.96522713, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.33447266, + "step": 359, + "time_per_iteration": 2.6423869132995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189921, + "balance_loss_mlp": 1.15472996, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.10829219970600838, + "language_loss": 0.98827034, + "learning_rate": 0.0009959662442761274, + "loss": 1.00016952, + "num_input_tokens_seen": 
29410784, + "router_z_loss_mlp": 0.35205078, + "step": 360, + "time_per_iteration": 2.941234827041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185036, + "balance_loss_mlp": 1.14810538, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.0683919199988589, + "language_loss": 0.92245018, + "learning_rate": 0.000995926654638155, + "loss": 0.9343006, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.36938477, + "step": 361, + "time_per_iteration": 2.837684154510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202565, + "balance_loss_mlp": 1.16482282, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.0951215156771631, + "language_loss": 0.9350909, + "learning_rate": 0.00099588687246438, + "loss": 0.94711655, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.37719727, + "step": 362, + "time_per_iteration": 2.9100425243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203748, + "balance_loss_mlp": 1.16460001, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.11257096193086513, + "language_loss": 1.01560402, + "learning_rate": 0.0009958468977702471, + "loss": 1.02764153, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.39160156, + "step": 363, + "time_per_iteration": 2.6317808628082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01643136, + "balance_loss_mlp": 1.57790494, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.0741524690412032, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81377846, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.65234375, + "step": 364, + "time_per_iteration": 4.827174663543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187013, + "balance_loss_mlp": 1.15229964, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 
0.07557653682461403, + "language_loss": 0.89914072, + "learning_rate": 0.0009957663708830612, + "loss": 0.91101086, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.34741211, + "step": 365, + "time_per_iteration": 3.280808448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201016, + "balance_loss_mlp": 1.16401315, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.11033601827195522, + "language_loss": 0.91889954, + "learning_rate": 0.0009957258187212714, + "loss": 0.93090969, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.36987305, + "step": 366, + "time_per_iteration": 3.0436058044433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01494507, + "balance_loss_mlp": 1.43309093, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.06331255113068197, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80689365, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.61328125, + "step": 367, + "time_per_iteration": 4.807323694229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209897, + "balance_loss_mlp": 1.17287028, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.07799784999620897, + "language_loss": 0.8953917, + "learning_rate": 0.0009956441370400167, + "loss": 0.90749061, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.37036133, + "step": 368, + "time_per_iteration": 2.678028106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218753, + "balance_loss_mlp": 1.18270361, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.11766553351136624, + "language_loss": 0.9529528, + "learning_rate": 0.0009956030075522636, + "loss": 0.96514034, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.3605957, + "step": 369, + "time_per_iteration": 2.7700181007385254 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01195721, + "balance_loss_mlp": 1.16050696, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.07977968738165528, + "language_loss": 0.95944411, + "learning_rate": 0.0009955616856543587, + "loss": 0.97140133, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.35205078, + "step": 370, + "time_per_iteration": 2.6467819213867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011765, + "balance_loss_mlp": 1.14142823, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.07610844541313569, + "language_loss": 0.88427055, + "learning_rate": 0.0009955201713623448, + "loss": 0.89603543, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.35083008, + "step": 371, + "time_per_iteration": 2.8926849365234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262059, + "balance_loss_mlp": 1.21208656, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.04749961224137468, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77934778, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.5, + "step": 372, + "time_per_iteration": 4.978249549865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137568, + "balance_loss_mlp": 1.10769427, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.10296972579398556, + "language_loss": 1.02312946, + "learning_rate": 0.0009954365656605333, + "loss": 1.03450513, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.29882812, + "step": 373, + "time_per_iteration": 2.5930416584014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163699, + "balance_loss_mlp": 1.1306777, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.08216565506059122, + "language_loss": 0.94662046, + "learning_rate": 0.0009953944742831947, + "loss": 0.95825744, + "num_input_tokens_seen": 30902224, + 
"router_z_loss_mlp": 0.33007812, + "step": 374, + "time_per_iteration": 3.02325701713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149175, + "balance_loss_mlp": 1.1185143, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.11719346683047478, + "language_loss": 0.98373723, + "learning_rate": 0.0009953521905766642, + "loss": 0.99522901, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.30639648, + "step": 375, + "time_per_iteration": 2.972064733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156803, + "balance_loss_mlp": 1.12435448, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.06602159555114745, + "language_loss": 0.97082627, + "learning_rate": 0.0009953097145573577, + "loss": 0.98239434, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.32446289, + "step": 376, + "time_per_iteration": 2.6647017002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183036, + "balance_loss_mlp": 1.14922833, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.0696983564537716, + "language_loss": 0.94069874, + "learning_rate": 0.000995267046241766, + "loss": 0.95252913, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.33837891, + "step": 377, + "time_per_iteration": 3.2699291706085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186549, + "balance_loss_mlp": 1.15281284, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.08226328739164854, + "language_loss": 0.94401312, + "learning_rate": 0.0009952241856464547, + "loss": 0.95587862, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.33764648, + "step": 378, + "time_per_iteration": 2.6432976722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191312, + "balance_loss_mlp": 1.15698004, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.12013480935274141, 
+ "language_loss": 1.00853705, + "learning_rate": 0.0009951811327880632, + "loss": 1.02045012, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.34350586, + "step": 379, + "time_per_iteration": 2.822204828262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192206, + "balance_loss_mlp": 1.15858889, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.08341634879178654, + "language_loss": 0.94250029, + "learning_rate": 0.0009951378876833063, + "loss": 0.95442235, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.3359375, + "step": 380, + "time_per_iteration": 2.630469799041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198017, + "balance_loss_mlp": 1.16311216, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.09052350379823415, + "language_loss": 1.00640893, + "learning_rate": 0.0009950944503489736, + "loss": 1.01838911, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.34936523, + "step": 381, + "time_per_iteration": 2.758171796798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202881, + "balance_loss_mlp": 1.16811991, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.08361033479665086, + "language_loss": 0.95423895, + "learning_rate": 0.0009950508208019285, + "loss": 0.96626776, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.34741211, + "step": 382, + "time_per_iteration": 2.9980571269989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187801, + "balance_loss_mlp": 1.15489948, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.06841783055573346, + "language_loss": 0.99123466, + "learning_rate": 0.0009950069990591096, + "loss": 1.00311255, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.32910156, + "step": 383, + "time_per_iteration": 2.723439931869507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01573707, + "balance_loss_mlp": 1.54185438, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.1397468631511101, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77975076, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.31835938, + "step": 384, + "time_per_iteration": 4.962388753890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189426, + "balance_loss_mlp": 1.15592778, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.0845037323153299, + "language_loss": 0.92480063, + "learning_rate": 0.0009949187790542777, + "loss": 0.93669498, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.33496094, + "step": 385, + "time_per_iteration": 2.7766611576080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193494, + "balance_loss_mlp": 1.16052091, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.0971687641338208, + "language_loss": 0.884184, + "learning_rate": 0.0009948743808265148, + "loss": 0.89611894, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.32983398, + "step": 386, + "time_per_iteration": 2.674055576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183765, + "balance_loss_mlp": 1.15150666, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.07384542423184925, + "language_loss": 0.97962248, + "learning_rate": 0.0009948297904714782, + "loss": 0.9914602, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.32250977, + "step": 387, + "time_per_iteration": 2.698899745941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179922, + "balance_loss_mlp": 1.14680552, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.06832562007069648, + "language_loss": 0.90421599, + "learning_rate": 0.0009947850080064796, + "loss": 0.91601527, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 
0.33105469, + "step": 388, + "time_per_iteration": 2.8182406425476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178557, + "balance_loss_mlp": 1.14639437, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.06958908790939329, + "language_loss": 0.94972193, + "learning_rate": 0.0009947400334489047, + "loss": 0.96150756, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.3215332, + "step": 389, + "time_per_iteration": 3.0191807746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180436, + "balance_loss_mlp": 1.14767742, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.0847539772518024, + "language_loss": 0.86555678, + "learning_rate": 0.0009946948668162145, + "loss": 0.87736106, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.32763672, + "step": 390, + "time_per_iteration": 2.7670745849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182484, + "balance_loss_mlp": 1.14886689, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.08648624436703037, + "language_loss": 0.91666478, + "learning_rate": 0.0009946495081259441, + "loss": 0.92848963, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.33618164, + "step": 391, + "time_per_iteration": 2.8355910778045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168167, + "balance_loss_mlp": 1.13454986, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.09254550247646448, + "language_loss": 0.94977629, + "learning_rate": 0.0009946039573957035, + "loss": 0.96145797, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.33618164, + "step": 392, + "time_per_iteration": 2.9576094150543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143715, + "balance_loss_mlp": 1.11300731, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.06908129255101257, + 
"language_loss": 0.91881704, + "learning_rate": 0.000994558214643177, + "loss": 0.93025422, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.30712891, + "step": 393, + "time_per_iteration": 2.757168769836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141782, + "balance_loss_mlp": 1.11102629, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.06274973991827922, + "language_loss": 0.93209511, + "learning_rate": 0.000994512279886123, + "loss": 0.94351292, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.30712891, + "step": 394, + "time_per_iteration": 3.1078224182128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134727, + "balance_loss_mlp": 1.10523462, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.07515736533799398, + "language_loss": 0.93902445, + "learning_rate": 0.0009944661531423758, + "loss": 0.95037174, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.29492188, + "step": 395, + "time_per_iteration": 2.6783392429351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149746, + "balance_loss_mlp": 1.12061143, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.07362715907626581, + "language_loss": 0.91989446, + "learning_rate": 0.000994419834429843, + "loss": 0.93139195, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.29125977, + "step": 396, + "time_per_iteration": 2.6774208545684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138515, + "balance_loss_mlp": 1.10887921, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.0979297809656427, + "language_loss": 0.95834494, + "learning_rate": 0.0009943733237665069, + "loss": 0.96973014, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.29663086, + "step": 397, + "time_per_iteration": 2.8543148040771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01162601, + "balance_loss_mlp": 1.13260818, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.07305506526715269, + "language_loss": 0.95531559, + "learning_rate": 0.0009943266211704248, + "loss": 0.96694154, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.29956055, + "step": 398, + "time_per_iteration": 2.9546711444854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155699, + "balance_loss_mlp": 1.12427545, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.0773299202354709, + "language_loss": 0.97448099, + "learning_rate": 0.000994279726659728, + "loss": 0.98603797, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.31396484, + "step": 399, + "time_per_iteration": 2.51406192779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178721, + "balance_loss_mlp": 1.14610541, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.0761918911056457, + "language_loss": 0.9424448, + "learning_rate": 0.0009942326402526231, + "loss": 0.95423204, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.32617188, + "step": 400, + "time_per_iteration": 2.578338146209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175813, + "balance_loss_mlp": 1.14300704, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.0730936916243032, + "language_loss": 0.93335903, + "learning_rate": 0.0009941853619673902, + "loss": 0.94511712, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.328125, + "step": 401, + "time_per_iteration": 2.6568922996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175231, + "balance_loss_mlp": 1.14356887, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.0850905540992329, + "language_loss": 0.95842957, + "learning_rate": 0.0009941378918223844, + "loss": 0.97018182, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 
0.31616211, + "step": 402, + "time_per_iteration": 3.098615884780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204282, + "balance_loss_mlp": 1.17018807, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.09392353942632323, + "language_loss": 0.9004057, + "learning_rate": 0.0009940902298360354, + "loss": 0.91244853, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.34130859, + "step": 403, + "time_per_iteration": 2.769843101501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188661, + "balance_loss_mlp": 1.15563989, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.0817674600565604, + "language_loss": 0.98311555, + "learning_rate": 0.0009940423760268473, + "loss": 0.99500215, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.33007812, + "step": 404, + "time_per_iteration": 2.8945982456207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187921, + "balance_loss_mlp": 1.15442348, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.0859899885976376, + "language_loss": 0.92015374, + "learning_rate": 0.0009939943304133982, + "loss": 0.93203294, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.33496094, + "step": 405, + "time_per_iteration": 2.649461269378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172828, + "balance_loss_mlp": 1.14228618, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.07444114263212052, + "language_loss": 0.99398023, + "learning_rate": 0.0009939460930143416, + "loss": 1.00570846, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.30517578, + "step": 406, + "time_per_iteration": 2.667829990386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181986, + "balance_loss_mlp": 1.15091991, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.08442820151342731, + "language_loss": 
0.93529546, + "learning_rate": 0.0009938976638484043, + "loss": 0.9471153, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.31054688, + "step": 407, + "time_per_iteration": 2.9581079483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184799, + "balance_loss_mlp": 1.15428162, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.08907940163556441, + "language_loss": 0.91453135, + "learning_rate": 0.0009938490429343887, + "loss": 0.92637932, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.30493164, + "step": 408, + "time_per_iteration": 2.6066792011260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198916, + "balance_loss_mlp": 1.16708684, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.09407218950155852, + "language_loss": 0.92654747, + "learning_rate": 0.0009938002302911709, + "loss": 0.93853664, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.31835938, + "step": 409, + "time_per_iteration": 2.7762253284454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206718, + "balance_loss_mlp": 1.17415047, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.10932104394797525, + "language_loss": 0.95012206, + "learning_rate": 0.0009937512259377015, + "loss": 0.96218926, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.32543945, + "step": 410, + "time_per_iteration": 2.7103991508483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265174, + "balance_loss_mlp": 1.23193812, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.08720536696991275, + "language_loss": 0.94637173, + "learning_rate": 0.000993702029893006, + "loss": 0.95902348, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.33251953, + "step": 411, + "time_per_iteration": 2.78560733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295676, + 
"balance_loss_mlp": 1.26029515, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.11720891364975168, + "language_loss": 0.93816972, + "learning_rate": 0.0009936526421761838, + "loss": 0.95112646, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.35400391, + "step": 412, + "time_per_iteration": 3.049868583679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128071, + "balance_loss_mlp": 1.24611533, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.095587468789244, + "language_loss": 0.96658343, + "learning_rate": 0.000993603062806409, + "loss": 0.9793905, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.34619141, + "step": 413, + "time_per_iteration": 2.6881110668182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262528, + "balance_loss_mlp": 1.22843432, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.10701391534122558, + "language_loss": 0.98645592, + "learning_rate": 0.0009935532918029298, + "loss": 0.99908125, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.34082031, + "step": 414, + "time_per_iteration": 2.6234540939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253352, + "balance_loss_mlp": 1.21847153, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.10153449079868698, + "language_loss": 0.92723763, + "learning_rate": 0.0009935033291850694, + "loss": 0.93977106, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.34887695, + "step": 415, + "time_per_iteration": 2.6565287113189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224774, + "balance_loss_mlp": 1.19258738, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.09081981361814888, + "language_loss": 0.94647777, + "learning_rate": 0.0009934531749722247, + "loss": 0.95872557, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.32177734, + 
"step": 416, + "time_per_iteration": 2.6504123210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214952, + "balance_loss_mlp": 1.18243122, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.08798076505254328, + "language_loss": 0.92810607, + "learning_rate": 0.0009934028291838672, + "loss": 0.94025552, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.32495117, + "step": 417, + "time_per_iteration": 2.7589621543884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202827, + "balance_loss_mlp": 1.17018712, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.08954115452054644, + "language_loss": 0.88617092, + "learning_rate": 0.0009933522918395433, + "loss": 0.8981992, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.32592773, + "step": 418, + "time_per_iteration": 2.713758707046509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01389029, + "balance_loss_mlp": 1.35851097, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.08425204298586858, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79640126, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.3046875, + "step": 419, + "time_per_iteration": 4.9331464767456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218909, + "balance_loss_mlp": 1.18479085, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.11622805941353512, + "language_loss": 1.05362594, + "learning_rate": 0.000993250642561551, + "loss": 1.06581497, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.34106445, + "step": 420, + "time_per_iteration": 2.672421455383301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181293, + "balance_loss_mlp": 1.14843905, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.10269562775159036, + "language_loss": 0.92318636, + 
"learning_rate": 0.0009931995306673466, + "loss": 0.93499923, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.32861328, + "step": 421, + "time_per_iteration": 2.7427923679351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168977, + "balance_loss_mlp": 1.13657558, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.11431346275656909, + "language_loss": 0.97376955, + "learning_rate": 0.000993148227296103, + "loss": 0.98545933, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.32397461, + "step": 422, + "time_per_iteration": 2.675947666168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151197, + "balance_loss_mlp": 1.12122786, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.0890704687176176, + "language_loss": 0.860506, + "learning_rate": 0.000993096732467738, + "loss": 0.87201798, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.29956055, + "step": 423, + "time_per_iteration": 3.060911178588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184994, + "balance_loss_mlp": 1.15089989, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.1141297149032614, + "language_loss": 0.91752422, + "learning_rate": 0.0009930450462022435, + "loss": 0.9293741, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.34106445, + "step": 424, + "time_per_iteration": 2.8769121170043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233579, + "balance_loss_mlp": 1.20020068, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.046425192010764525, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80423385, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.33398438, + "step": 425, + "time_per_iteration": 4.897430896759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206307, + "balance_loss_mlp": 
1.17078233, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.08757679589662427, + "language_loss": 0.89939743, + "learning_rate": 0.0009929410994402065, + "loss": 0.91146052, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.35522461, + "step": 426, + "time_per_iteration": 3.8015847206115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247446, + "balance_loss_mlp": 1.21072912, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.09694830127406533, + "language_loss": 0.94969749, + "learning_rate": 0.0009928888389840196, + "loss": 0.96217191, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.3671875, + "step": 427, + "time_per_iteration": 2.7042434215545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244821, + "balance_loss_mlp": 1.21010745, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.09892511285502391, + "language_loss": 0.97471511, + "learning_rate": 0.0009928363871714147, + "loss": 0.98716331, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.34716797, + "step": 428, + "time_per_iteration": 2.6848952770233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253097, + "balance_loss_mlp": 1.21733463, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.08269527052289877, + "language_loss": 0.91760862, + "learning_rate": 0.0009927837440227556, + "loss": 0.9301396, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.35766602, + "step": 429, + "time_per_iteration": 2.838052749633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215238, + "balance_loss_mlp": 1.18357563, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.07794556654442977, + "language_loss": 0.88257664, + "learning_rate": 0.0009927309095584798, + "loss": 0.89472902, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.31640625, + "step": 430, + 
"time_per_iteration": 3.010039806365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212644, + "balance_loss_mlp": 1.18246055, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.10632891775269031, + "language_loss": 0.96743113, + "learning_rate": 0.0009926778837991, + "loss": 0.97955757, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.30175781, + "step": 431, + "time_per_iteration": 2.734591484069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182523, + "balance_loss_mlp": 1.15226734, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.09435654496071201, + "language_loss": 0.9613564, + "learning_rate": 0.000992624666765202, + "loss": 0.97318161, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.30249023, + "step": 432, + "time_per_iteration": 2.829540252685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164349, + "balance_loss_mlp": 1.13523841, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.09286672234440549, + "language_loss": 0.93021452, + "learning_rate": 0.000992571258477447, + "loss": 0.94185793, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.29101562, + "step": 433, + "time_per_iteration": 2.8295536041259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154694, + "balance_loss_mlp": 1.12720466, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.10037104501236055, + "language_loss": 0.88638759, + "learning_rate": 0.0009925176589565695, + "loss": 0.89793456, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.27514648, + "step": 434, + "time_per_iteration": 2.8025705814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164119, + "balance_loss_mlp": 1.13445985, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.1154039733497609, + "language_loss": 0.97325677, + "learning_rate": 
0.0009924638682233791, + "loss": 0.98489797, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.29663086, + "step": 435, + "time_per_iteration": 2.576300621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175635, + "balance_loss_mlp": 1.14626217, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.058007479940938765, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80740231, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.29296875, + "step": 436, + "time_per_iteration": 4.615980625152588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203285, + "balance_loss_mlp": 1.17262459, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.10734010742427191, + "language_loss": 0.87080061, + "learning_rate": 0.0009923557132036668, + "loss": 0.88283348, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.30664062, + "step": 437, + "time_per_iteration": 3.098910331726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203826, + "balance_loss_mlp": 1.1721158, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.10713326361470918, + "language_loss": 0.92728174, + "learning_rate": 0.0009923013489591345, + "loss": 0.93932003, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.31713867, + "step": 438, + "time_per_iteration": 2.7423956394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198894, + "balance_loss_mlp": 1.16902053, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.10035753440716286, + "language_loss": 0.90567303, + "learning_rate": 0.0009922467935862681, + "loss": 0.91766196, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.29882812, + "step": 439, + "time_per_iteration": 3.1101534366607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205509, + "balance_loss_mlp": 1.17477679, + 
"epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.11954514685823285, + "language_loss": 0.93942809, + "learning_rate": 0.0009921920471062478, + "loss": 0.95148319, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.30712891, + "step": 440, + "time_per_iteration": 2.600698947906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120422, + "balance_loss_mlp": 1.1727252, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.09556707126690236, + "language_loss": 0.90983319, + "learning_rate": 0.0009921371095403281, + "loss": 0.92187542, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.31518555, + "step": 441, + "time_per_iteration": 2.6733319759368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223638, + "balance_loss_mlp": 1.19223797, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.07797244609311368, + "language_loss": 0.93788469, + "learning_rate": 0.0009920819809098379, + "loss": 0.95012105, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.3137207, + "step": 442, + "time_per_iteration": 2.6183252334594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225883, + "balance_loss_mlp": 1.1949122, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.09885461493176821, + "language_loss": 0.89838576, + "learning_rate": 0.0009920266612361798, + "loss": 0.91064465, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.30957031, + "step": 443, + "time_per_iteration": 2.8172709941864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226855, + "balance_loss_mlp": 1.19721913, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.0888891387256682, + "language_loss": 0.90358502, + "learning_rate": 0.0009919711505408308, + "loss": 0.91585356, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.29614258, + "step": 444, + 
"time_per_iteration": 2.8260107040405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210635, + "balance_loss_mlp": 1.17978323, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.08298354336382399, + "language_loss": 0.88123727, + "learning_rate": 0.000991915448845342, + "loss": 0.89334357, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.30810547, + "step": 445, + "time_per_iteration": 2.5825653076171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189896, + "balance_loss_mlp": 1.16185772, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.079307281997536, + "language_loss": 0.97017783, + "learning_rate": 0.000991859556171339, + "loss": 0.98207676, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.28027344, + "step": 446, + "time_per_iteration": 2.60908579826355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169669, + "balance_loss_mlp": 1.14200044, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.12218297197997938, + "language_loss": 0.98194999, + "learning_rate": 0.000991803472540521, + "loss": 0.99364674, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.27648926, + "step": 447, + "time_per_iteration": 2.6712088584899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151986, + "balance_loss_mlp": 1.12646365, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.09227172547062512, + "language_loss": 0.94125748, + "learning_rate": 0.0009917471979746615, + "loss": 0.95277739, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.25549316, + "step": 448, + "time_per_iteration": 3.075975179672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168815, + "balance_loss_mlp": 1.1426959, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.08941799521583026, + "language_loss": 0.93856514, + "learning_rate": 
0.0009916907324956086, + "loss": 0.95025325, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.26123047, + "step": 449, + "time_per_iteration": 2.736283540725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172223, + "balance_loss_mlp": 1.14490044, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.10083399298029862, + "language_loss": 0.89324713, + "learning_rate": 0.0009916340761252837, + "loss": 0.90496939, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.27331543, + "step": 450, + "time_per_iteration": 2.7378761768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159685, + "balance_loss_mlp": 1.13442445, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.08549336336253632, + "language_loss": 0.87181985, + "learning_rate": 0.0009915772288856832, + "loss": 0.88341665, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.25268555, + "step": 451, + "time_per_iteration": 3.0766491889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155631, + "balance_loss_mlp": 1.12976265, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.07927995723527953, + "language_loss": 0.88654345, + "learning_rate": 0.000991520190798877, + "loss": 0.89809978, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.2590332, + "step": 452, + "time_per_iteration": 2.838925838470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158093, + "balance_loss_mlp": 1.13122344, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.12430534270573573, + "language_loss": 0.96291733, + "learning_rate": 0.0009914629618870089, + "loss": 0.97449821, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.26904297, + "step": 453, + "time_per_iteration": 2.902444839477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103846, + "balance_loss_mlp": 1.0800997, + 
"epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.040702290127782634, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.7977972, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.23730469, + "step": 454, + "time_per_iteration": 4.758902072906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089921, + "balance_loss_mlp": 1.06579328, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.037925831915212815, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.8251788, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.24121094, + "step": 455, + "time_per_iteration": 4.819866180419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230669, + "balance_loss_mlp": 1.19860172, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.12072891758744606, + "language_loss": 0.9005816, + "learning_rate": 0.0009912901304235883, + "loss": 0.91288829, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.32055664, + "step": 456, + "time_per_iteration": 2.928392171859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251391, + "balance_loss_mlp": 1.21851277, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.11610109334616998, + "language_loss": 0.85792667, + "learning_rate": 0.000991232138434397, + "loss": 0.8704406, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.32885742, + "step": 457, + "time_per_iteration": 2.868086099624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268634, + "balance_loss_mlp": 1.23406374, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.1267050228562, + "language_loss": 0.92359412, + "learning_rate": 0.000991173955731976, + "loss": 0.93628043, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.34594727, + "step": 458, + "time_per_iteration": 
2.673590898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225195, + "balance_loss_mlp": 1.19374788, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.08225728813848474, + "language_loss": 0.98437196, + "learning_rate": 0.0009911155823389137, + "loss": 0.99662387, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.31445312, + "step": 459, + "time_per_iteration": 3.052828550338745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208858, + "balance_loss_mlp": 1.17938948, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.06750279925545952, + "language_loss": 0.93789524, + "learning_rate": 0.000991057018277873, + "loss": 0.94998378, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.29467773, + "step": 460, + "time_per_iteration": 2.7062363624572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175577, + "balance_loss_mlp": 1.14656162, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.09934743705595177, + "language_loss": 0.93365753, + "learning_rate": 0.0009909982635715898, + "loss": 0.94541329, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.28979492, + "step": 461, + "time_per_iteration": 2.647963523864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163614, + "balance_loss_mlp": 1.13576651, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.09828505426249505, + "language_loss": 0.93045211, + "learning_rate": 0.0009909393182428751, + "loss": 0.94208831, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.27856445, + "step": 462, + "time_per_iteration": 2.6743412017822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163317, + "balance_loss_mlp": 1.13556552, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.08889819955039441, + "language_loss": 0.88051188, + "learning_rate": 
0.000990880182314614, + "loss": 0.89214504, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.27758789, + "step": 463, + "time_per_iteration": 2.703216314315796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163709, + "balance_loss_mlp": 1.1364336, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.07282859671945509, + "language_loss": 0.89247352, + "learning_rate": 0.0009908208558097643, + "loss": 0.90411055, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.27319336, + "step": 464, + "time_per_iteration": 2.9412851333618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011721, + "balance_loss_mlp": 1.14410961, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.07278927788912996, + "language_loss": 0.90032935, + "learning_rate": 0.000990761338751359, + "loss": 0.91205037, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.2800293, + "step": 465, + "time_per_iteration": 2.7876837253570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_mlp": 1.0181731, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.02426695301026172, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74698591, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.20800781, + "step": 466, + "time_per_iteration": 5.05983304977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189569, + "balance_loss_mlp": 1.16098237, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.07846733746050528, + "language_loss": 0.9248395, + "learning_rate": 0.0009906417330663815, + "loss": 0.93673521, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.28588867, + "step": 467, + "time_per_iteration": 2.696319103240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194386, + "balance_loss_mlp": 1.16539454, + "epoch": 
0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.08323950534675657, + "language_loss": 0.88480067, + "learning_rate": 0.0009905816444862442, + "loss": 0.89674455, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.29003906, + "step": 468, + "time_per_iteration": 2.6381607055664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218295, + "balance_loss_mlp": 1.18875456, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.07740224213463104, + "language_loss": 0.8706888, + "learning_rate": 0.0009905213654454216, + "loss": 0.88287175, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.29516602, + "step": 469, + "time_per_iteration": 2.9251277446746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229446, + "balance_loss_mlp": 1.19940567, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.08990381668478556, + "language_loss": 0.94001997, + "learning_rate": 0.0009904608959673158, + "loss": 0.95231444, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.30053711, + "step": 470, + "time_per_iteration": 2.812967538833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247261, + "balance_loss_mlp": 1.21679068, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.12209234788536222, + "language_loss": 0.92894399, + "learning_rate": 0.000990400236075403, + "loss": 0.94141662, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.3046875, + "step": 471, + "time_per_iteration": 2.5002622604370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205406, + "balance_loss_mlp": 1.17622375, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.10180872621251921, + "language_loss": 0.91581351, + "learning_rate": 0.0009903393857932338, + "loss": 0.92786753, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.29150391, + "step": 472, + "time_per_iteration": 
2.656669855117798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119747, + "balance_loss_mlp": 1.16866922, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.09565392288785843, + "language_loss": 0.88802767, + "learning_rate": 0.0009902783451444317, + "loss": 0.90000236, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.28808594, + "step": 473, + "time_per_iteration": 2.769510269165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118613, + "balance_loss_mlp": 1.16034544, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.10259894411844421, + "language_loss": 0.94123209, + "learning_rate": 0.0009902171141526956, + "loss": 0.95309335, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.25769043, + "step": 474, + "time_per_iteration": 2.523611545562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119701, + "balance_loss_mlp": 1.17120147, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.11667434950480311, + "language_loss": 0.82319391, + "learning_rate": 0.000990155692841797, + "loss": 0.83516395, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.25817871, + "step": 475, + "time_per_iteration": 2.9675121307373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227501, + "balance_loss_mlp": 1.20134616, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.09682112540143008, + "language_loss": 0.93477046, + "learning_rate": 0.0009900940812355818, + "loss": 0.94704551, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.26147461, + "step": 476, + "time_per_iteration": 2.924874782562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242063, + "balance_loss_mlp": 1.21519351, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.10139353171378648, + "language_loss": 0.88050354, + "learning_rate": 0.00099003227935797, + 
"loss": 0.89292419, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.26879883, + "step": 477, + "time_per_iteration": 2.7283573150634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261302, + "balance_loss_mlp": 1.23314476, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.08348382552066277, + "language_loss": 0.91095632, + "learning_rate": 0.000989970287232955, + "loss": 0.92356932, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.28149414, + "step": 478, + "time_per_iteration": 2.8266103267669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241973, + "balance_loss_mlp": 1.21583056, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.10737558840696987, + "language_loss": 0.89902192, + "learning_rate": 0.0009899081048846043, + "loss": 0.91144162, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.26135254, + "step": 479, + "time_per_iteration": 2.6420280933380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291965, + "balance_loss_mlp": 1.26427281, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.10012000168567356, + "language_loss": 0.93502498, + "learning_rate": 0.0009898457323370593, + "loss": 0.94794464, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.27697754, + "step": 480, + "time_per_iteration": 2.6065309047698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246143, + "balance_loss_mlp": 1.21961892, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.0993880337212747, + "language_loss": 0.92708224, + "learning_rate": 0.000989783169614535, + "loss": 0.93954372, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.26525879, + "step": 481, + "time_per_iteration": 2.7099456787109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079427, + "balance_loss_mlp": 1.05930424, + "epoch": 0.09272797229703732, 
+ "flos": 1538042370048.0, + "grad_norm": 0.03505173716607146, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79832184, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.20117188, + "step": 482, + "time_per_iteration": 4.890375852584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231966, + "balance_loss_mlp": 1.20573974, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.10137363964482546, + "language_loss": 0.90139151, + "learning_rate": 0.000989657473741779, + "loss": 0.91371119, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.2623291, + "step": 483, + "time_per_iteration": 2.9370570182800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207368, + "balance_loss_mlp": 1.18004489, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.08498377120371232, + "language_loss": 0.9143101, + "learning_rate": 0.0009895943406403465, + "loss": 0.92638373, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.2734375, + "step": 484, + "time_per_iteration": 2.7508950233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207641, + "balance_loss_mlp": 1.17798209, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.09176142665566275, + "language_loss": 0.84377563, + "learning_rate": 0.0009895310174615338, + "loss": 0.85585213, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.29638672, + "step": 485, + "time_per_iteration": 2.785452365875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111377, + "balance_loss_mlp": 1.09211314, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.060723434374539316, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76829892, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.19238281, + "step": 486, + "time_per_iteration": 4.6911780834198 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119428, + "balance_loss_mlp": 1.16636121, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.10396612544825783, + "language_loss": 0.89653724, + "learning_rate": 0.0009894038009701782, + "loss": 0.90848005, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.27954102, + "step": 487, + "time_per_iteration": 2.6375234127044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240132, + "balance_loss_mlp": 1.20847011, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.09761423787564506, + "language_loss": 0.88893723, + "learning_rate": 0.0009893399077070253, + "loss": 0.90133858, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.31640625, + "step": 488, + "time_per_iteration": 2.63673734664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217529, + "balance_loss_mlp": 1.18844151, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.08578938939363263, + "language_loss": 0.87286389, + "learning_rate": 0.0009892758244652718, + "loss": 0.88503921, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.29077148, + "step": 489, + "time_per_iteration": 2.6579813957214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226074, + "balance_loss_mlp": 1.19698668, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.10664482488995004, + "language_loss": 0.91801828, + "learning_rate": 0.0009892115512697968, + "loss": 0.93027902, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.29101562, + "step": 490, + "time_per_iteration": 2.744812250137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208984, + "balance_loss_mlp": 1.18106508, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.07150484911777356, + "language_loss": 0.94226933, + "learning_rate": 0.0009891470881455537, + "loss": 0.95435917, + 
"num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.27905273, + "step": 491, + "time_per_iteration": 2.7888436317443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184692, + "balance_loss_mlp": 1.15854979, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.08034794474061628, + "language_loss": 0.91272295, + "learning_rate": 0.0009890824351175692, + "loss": 0.92456985, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.26184082, + "step": 492, + "time_per_iteration": 2.6893324851989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168375, + "balance_loss_mlp": 1.1430074, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.11413207975143042, + "language_loss": 0.96479064, + "learning_rate": 0.0009890175922109435, + "loss": 0.9764744, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.25378418, + "step": 493, + "time_per_iteration": 2.678849935531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184878, + "balance_loss_mlp": 1.15874791, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.08018179898504754, + "language_loss": 0.9392823, + "learning_rate": 0.0009889525594508513, + "loss": 0.95113099, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.26147461, + "step": 494, + "time_per_iteration": 3.067603349685669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118757, + "balance_loss_mlp": 1.16171312, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.06605208103828443, + "language_loss": 0.88701022, + "learning_rate": 0.0009888873368625404, + "loss": 0.89888591, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.25891113, + "step": 495, + "time_per_iteration": 2.513042688369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208336, + "balance_loss_mlp": 1.18301558, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, 
+ "grad_norm": 0.21045205282495727, + "language_loss": 0.923554, + "learning_rate": 0.0009888219244713326, + "loss": 0.93563735, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.2532959, + "step": 496, + "time_per_iteration": 2.867083787918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121286, + "balance_loss_mlp": 1.18638349, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.11531388313037762, + "language_loss": 0.9129262, + "learning_rate": 0.0009887563223026229, + "loss": 0.92505479, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.26501465, + "step": 497, + "time_per_iteration": 2.708878993988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251726, + "balance_loss_mlp": 1.23503661, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.1018924396807409, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80319893, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.16699219, + "step": 498, + "time_per_iteration": 4.9335105419158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203662, + "balance_loss_mlp": 1.1767087, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.15301104951645897, + "language_loss": 0.9155978, + "learning_rate": 0.0009886245487346482, + "loss": 0.92763442, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.2701416, + "step": 499, + "time_per_iteration": 3.048356771469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012374, + "balance_loss_mlp": 1.20936203, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.11445293306924414, + "language_loss": 0.93613195, + "learning_rate": 0.0009885583773865422, + "loss": 0.948506, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.28076172, + "step": 500, + "time_per_iteration": 2.4812135696411133 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0124857, + "balance_loss_mlp": 1.21948266, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.08673144300895683, + "language_loss": 0.91201293, + "learning_rate": 0.0009884920163632524, + "loss": 0.92449856, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.2911377, + "step": 501, + "time_per_iteration": 2.6971659660339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267642, + "balance_loss_mlp": 1.23836422, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.09557615258578338, + "language_loss": 0.9327184, + "learning_rate": 0.000988425465690543, + "loss": 0.94539481, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.29296875, + "step": 502, + "time_per_iteration": 2.6156561374664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125655, + "balance_loss_mlp": 1.22724855, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.09431767215346384, + "language_loss": 0.90255487, + "learning_rate": 0.0009883587253942505, + "loss": 0.91512042, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.29284668, + "step": 503, + "time_per_iteration": 2.8239471912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284099, + "balance_loss_mlp": 1.25420117, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.11394137891765209, + "language_loss": 0.96597123, + "learning_rate": 0.0009882917955002862, + "loss": 0.97881228, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.29907227, + "step": 504, + "time_per_iteration": 2.603328227996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229149, + "balance_loss_mlp": 1.2021842, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.09281538791599028, + "language_loss": 0.89550316, + "learning_rate": 0.0009882246760346343, + "loss": 0.90779471, + "num_input_tokens_seen": 42007552, 
+ "router_z_loss_mlp": 0.26977539, + "step": 505, + "time_per_iteration": 2.681687831878662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229765, + "balance_loss_mlp": 1.20144057, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.10637320281066408, + "language_loss": 0.9312228, + "learning_rate": 0.0009881573670233533, + "loss": 0.94352043, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.28295898, + "step": 506, + "time_per_iteration": 2.5317869186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210552, + "balance_loss_mlp": 1.18480229, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.08668641437707587, + "language_loss": 0.88418829, + "learning_rate": 0.0009880898684925747, + "loss": 0.89629376, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.25769043, + "step": 507, + "time_per_iteration": 2.7037086486816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171606, + "balance_loss_mlp": 1.14662004, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.09301682260046856, + "language_loss": 0.8754462, + "learning_rate": 0.0009880221804685037, + "loss": 0.88716233, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.24987793, + "step": 508, + "time_per_iteration": 2.5904412269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132812, + "balance_loss_mlp": 1.11431122, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.05770369236985839, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80477232, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.18457031, + "step": 509, + "time_per_iteration": 4.754728317260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155131, + "balance_loss_mlp": 1.12993002, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 
0.08546011105886044, + "language_loss": 0.93283963, + "learning_rate": 0.0009878862360456733, + "loss": 0.94439089, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.25219727, + "step": 510, + "time_per_iteration": 2.7473011016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139729, + "balance_loss_mlp": 1.11480212, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.09364527364696289, + "language_loss": 0.86814249, + "learning_rate": 0.0009878179796996922, + "loss": 0.87953973, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.24926758, + "step": 511, + "time_per_iteration": 2.74253249168396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157281, + "balance_loss_mlp": 1.13087618, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.0728025857811697, + "language_loss": 0.90271652, + "learning_rate": 0.0009877495339659754, + "loss": 0.91428936, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.26428223, + "step": 512, + "time_per_iteration": 2.7383904457092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011795, + "balance_loss_mlp": 1.15373945, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.08851969035528326, + "language_loss": 0.84944135, + "learning_rate": 0.000987680898871096, + "loss": 0.86123633, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.2578125, + "step": 513, + "time_per_iteration": 2.7576277256011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213945, + "balance_loss_mlp": 1.18686032, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.10650793826837307, + "language_loss": 0.85207206, + "learning_rate": 0.0009876120744417, + "loss": 0.8642115, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.27075195, + "step": 514, + "time_per_iteration": 2.9868528842926025 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01205226, + "balance_loss_mlp": 1.17891693, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.12423818842648264, + "language_loss": 0.94048339, + "learning_rate": 0.0009875430607045078, + "loss": 0.95253563, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.2635498, + "step": 515, + "time_per_iteration": 2.6809887886047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217575, + "balance_loss_mlp": 1.19226718, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.09121928261100491, + "language_loss": 0.90633368, + "learning_rate": 0.000987473857686313, + "loss": 0.91850942, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.2532959, + "step": 516, + "time_per_iteration": 2.821868896484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273678, + "balance_loss_mlp": 1.24556851, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.10235865570139392, + "language_loss": 0.92397732, + "learning_rate": 0.0009874044654139824, + "loss": 0.93671417, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.28125, + "step": 517, + "time_per_iteration": 2.754556894302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269361, + "balance_loss_mlp": 1.24070311, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.1033638080855083, + "language_loss": 0.91346741, + "learning_rate": 0.0009873348839144563, + "loss": 0.92616105, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.28662109, + "step": 518, + "time_per_iteration": 2.5521421432495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223264, + "balance_loss_mlp": 1.19750261, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.08349046242237956, + "language_loss": 0.9484781, + "learning_rate": 0.000987265113214749, + "loss": 0.96071064, + "num_input_tokens_seen": 43138048, + 
"router_z_loss_mlp": 0.25793457, + "step": 519, + "time_per_iteration": 2.5728440284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209741, + "balance_loss_mlp": 1.18294215, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.0925674463481217, + "language_loss": 0.93808675, + "learning_rate": 0.0009871951533419476, + "loss": 0.95018411, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.26794434, + "step": 520, + "time_per_iteration": 2.720158576965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173015, + "balance_loss_mlp": 1.14725351, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.08576102326010304, + "language_loss": 0.87117791, + "learning_rate": 0.0009871250043232132, + "loss": 0.88290811, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.25769043, + "step": 521, + "time_per_iteration": 2.7765281200408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167625, + "balance_loss_mlp": 1.14231658, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.08176746103179605, + "language_loss": 0.85016751, + "learning_rate": 0.0009870546661857797, + "loss": 0.86184376, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.25317383, + "step": 522, + "time_per_iteration": 2.621741771697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192442, + "balance_loss_mlp": 1.16581106, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.1034937566099096, + "language_loss": 0.93671012, + "learning_rate": 0.0009869841389569553, + "loss": 0.94863456, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.26647949, + "step": 523, + "time_per_iteration": 2.9877190589904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176234, + "balance_loss_mlp": 1.15106893, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.09839184495226623, 
+ "language_loss": 0.87745041, + "learning_rate": 0.0009869134226641206, + "loss": 0.88921273, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.25170898, + "step": 524, + "time_per_iteration": 2.5881335735321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182922, + "balance_loss_mlp": 1.15635061, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.08405321822424026, + "language_loss": 0.86857122, + "learning_rate": 0.0009868425173347303, + "loss": 0.88040042, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.26599121, + "step": 525, + "time_per_iteration": 2.66532301902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171089, + "balance_loss_mlp": 1.14556646, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.08405786654151125, + "language_loss": 0.94851571, + "learning_rate": 0.0009867714229963125, + "loss": 0.96022666, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.25549316, + "step": 526, + "time_per_iteration": 2.8129477500915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180551, + "balance_loss_mlp": 1.15515995, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.0887042459069511, + "language_loss": 0.92144597, + "learning_rate": 0.000986700139676468, + "loss": 0.93325144, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.25402832, + "step": 527, + "time_per_iteration": 2.5864803791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221342, + "balance_loss_mlp": 1.19498479, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.0908626798732068, + "language_loss": 0.89802891, + "learning_rate": 0.0009866286674028717, + "loss": 0.91024232, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.26379395, + "step": 528, + "time_per_iteration": 2.6321539878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01195715, + "balance_loss_mlp": 1.1701684, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.10105960014250041, + "language_loss": 0.86296791, + "learning_rate": 0.0009865570062032717, + "loss": 0.87492502, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.25561523, + "step": 529, + "time_per_iteration": 2.9451780319213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180062, + "balance_loss_mlp": 1.15431321, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.07153867300670864, + "language_loss": 0.9169668, + "learning_rate": 0.0009864851561054893, + "loss": 0.92876744, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.25756836, + "step": 530, + "time_per_iteration": 2.829380512237549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138967, + "balance_loss_mlp": 1.11554205, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.07949334936814403, + "language_loss": 0.90603149, + "learning_rate": 0.0009864131171374191, + "loss": 0.9174211, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.23413086, + "step": 531, + "time_per_iteration": 2.7103002071380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144785, + "balance_loss_mlp": 1.12042999, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.09480674153197077, + "language_loss": 0.89899409, + "learning_rate": 0.0009863408893270292, + "loss": 0.91044188, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.24353027, + "step": 532, + "time_per_iteration": 2.800015926361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135275, + "balance_loss_mlp": 1.11101604, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.12452848702407365, + "language_loss": 0.84814823, + "learning_rate": 0.0009862684727023605, + "loss": 0.85950094, + "num_input_tokens_seen": 44152304, + 
"router_z_loss_mlp": 0.24243164, + "step": 533, + "time_per_iteration": 2.733250856399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142831, + "balance_loss_mlp": 1.11813033, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.10251703935298907, + "language_loss": 0.88274956, + "learning_rate": 0.0009861958672915283, + "loss": 0.89417779, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.24707031, + "step": 534, + "time_per_iteration": 2.8380610942840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151783, + "balance_loss_mlp": 1.12847757, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.08316309975829886, + "language_loss": 0.88756025, + "learning_rate": 0.0009861230731227201, + "loss": 0.89907813, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.23291016, + "step": 535, + "time_per_iteration": 2.871997594833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188729, + "balance_loss_mlp": 1.16410041, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.08198011669981227, + "language_loss": 0.89923763, + "learning_rate": 0.0009860500902241973, + "loss": 0.91112483, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.24633789, + "step": 536, + "time_per_iteration": 2.623779058456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200536, + "balance_loss_mlp": 1.17560923, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.07805911222058415, + "language_loss": 0.94478881, + "learning_rate": 0.0009859769186242942, + "loss": 0.95679414, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.24914551, + "step": 537, + "time_per_iteration": 2.580596923828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237375, + "balance_loss_mlp": 1.21290088, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 
0.07373890024349967, + "language_loss": 0.87774181, + "learning_rate": 0.0009859035583514187, + "loss": 0.89011556, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.24450684, + "step": 538, + "time_per_iteration": 2.6570377349853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278522, + "balance_loss_mlp": 1.25283265, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.09282522264822365, + "language_loss": 0.89254487, + "learning_rate": 0.0009858300094340517, + "loss": 0.90533006, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.25720215, + "step": 539, + "time_per_iteration": 2.787065267562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129717, + "balance_loss_mlp": 1.271981, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.12009350418361847, + "language_loss": 0.84273541, + "learning_rate": 0.0009857562719007473, + "loss": 0.85570705, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.2520752, + "step": 540, + "time_per_iteration": 2.60508394241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269758, + "balance_loss_mlp": 1.24520063, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.09915993306854447, + "language_loss": 0.86265039, + "learning_rate": 0.0009856823457801331, + "loss": 0.87534791, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.24560547, + "step": 541, + "time_per_iteration": 2.916395664215088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200462, + "balance_loss_mlp": 1.17673898, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.08980852435022621, + "language_loss": 0.93430036, + "learning_rate": 0.00098560823110091, + "loss": 0.94630498, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.23718262, + "step": 542, + "time_per_iteration": 2.6473944187164307 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.011666, + "balance_loss_mlp": 1.14424849, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.09857064774686858, + "language_loss": 0.94166034, + "learning_rate": 0.000985533927891851, + "loss": 0.95332634, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.22338867, + "step": 543, + "time_per_iteration": 2.7833001613616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152313, + "balance_loss_mlp": 1.13023496, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.11299567756475092, + "language_loss": 0.91803026, + "learning_rate": 0.0009854594361818044, + "loss": 0.92955339, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.22070312, + "step": 544, + "time_per_iteration": 2.7342488765716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145926, + "balance_loss_mlp": 1.12322879, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.10706082764174026, + "language_loss": 0.90779245, + "learning_rate": 0.0009853847559996897, + "loss": 0.91925174, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.22680664, + "step": 545, + "time_per_iteration": 2.7671496868133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115478, + "balance_loss_mlp": 1.13199878, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.09298705322285353, + "language_loss": 0.90420544, + "learning_rate": 0.0009853098873745, + "loss": 0.91575325, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.2277832, + "step": 546, + "time_per_iteration": 3.0312061309814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114995, + "balance_loss_mlp": 1.12715745, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.08666617811450783, + "language_loss": 0.89437926, + "learning_rate": 0.0009852348303353027, + "loss": 0.90587872, + "num_input_tokens_seen": 45172784, + 
"router_z_loss_mlp": 0.22802734, + "step": 547, + "time_per_iteration": 2.8053338527679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175894, + "balance_loss_mlp": 1.15260065, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.07202142444648872, + "language_loss": 0.8282218, + "learning_rate": 0.000985159584911237, + "loss": 0.83998078, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.23291016, + "step": 548, + "time_per_iteration": 3.168396472930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200066, + "balance_loss_mlp": 1.17569995, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.0989690478360349, + "language_loss": 0.89268672, + "learning_rate": 0.0009850841511315162, + "loss": 0.9046874, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.24365234, + "step": 549, + "time_per_iteration": 2.6511220932006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205335, + "balance_loss_mlp": 1.18058681, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.10906170470493136, + "language_loss": 0.90274942, + "learning_rate": 0.0009850085290254256, + "loss": 0.91480273, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.24755859, + "step": 550, + "time_per_iteration": 2.8123652935028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166073, + "balance_loss_mlp": 1.14193285, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.06887070936512274, + "language_loss": 0.8779422, + "learning_rate": 0.0009849327186223246, + "loss": 0.88960296, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.24121094, + "step": 551, + "time_per_iteration": 2.780959129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144115, + "balance_loss_mlp": 1.12010658, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.1035499947998288, + 
"language_loss": 0.94864386, + "learning_rate": 0.000984856719951646, + "loss": 0.96008497, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.23986816, + "step": 552, + "time_per_iteration": 2.599581718444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135304, + "balance_loss_mlp": 1.1112473, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.08131430219430819, + "language_loss": 0.91351348, + "learning_rate": 0.0009847805330428943, + "loss": 0.92486656, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.24035645, + "step": 553, + "time_per_iteration": 2.9599480628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126363, + "balance_loss_mlp": 1.1017344, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.10883623187773357, + "language_loss": 0.92631853, + "learning_rate": 0.0009847041579256481, + "loss": 0.93758214, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.24633789, + "step": 554, + "time_per_iteration": 2.592348575592041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139312, + "balance_loss_mlp": 1.11518431, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.08685206815428315, + "language_loss": 0.94236493, + "learning_rate": 0.0009846275946295592, + "loss": 0.95375812, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.2409668, + "step": 555, + "time_per_iteration": 2.6748178005218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157684, + "balance_loss_mlp": 1.13367498, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.1423144419608042, + "language_loss": 0.86826319, + "learning_rate": 0.0009845508431843518, + "loss": 0.87984002, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.23974609, + "step": 556, + "time_per_iteration": 3.0652637481689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01188505, + "balance_loss_mlp": 1.16398418, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.08544226719489541, + "language_loss": 0.87931871, + "learning_rate": 0.0009844739036198233, + "loss": 0.89120376, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.24523926, + "step": 557, + "time_per_iteration": 2.667473793029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210589, + "balance_loss_mlp": 1.18594849, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.07677269921737997, + "language_loss": 0.9440788, + "learning_rate": 0.0009843967759658448, + "loss": 0.95618474, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.24658203, + "step": 558, + "time_per_iteration": 2.7628064155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132066, + "balance_loss_mlp": 1.11194348, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.0590422913979422, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73899817, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.20117188, + "step": 559, + "time_per_iteration": 4.902129888534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241703, + "balance_loss_mlp": 1.21570349, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.0867010736609256, + "language_loss": 0.9488945, + "learning_rate": 0.000984241956509384, + "loss": 0.96131158, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.26025391, + "step": 560, + "time_per_iteration": 2.6891891956329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208783, + "balance_loss_mlp": 1.18289042, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.08963888455934524, + "language_loss": 0.90658677, + "learning_rate": 0.0009841642647670078, + "loss": 0.91867459, + "num_input_tokens_seen": 46349664, + 
"router_z_loss_mlp": 0.25927734, + "step": 561, + "time_per_iteration": 2.563408613204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198224, + "balance_loss_mlp": 1.17229605, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.08487676980325562, + "language_loss": 0.85033154, + "learning_rate": 0.0009840863850553944, + "loss": 0.86231375, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.25964355, + "step": 562, + "time_per_iteration": 2.9805734157562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183286, + "balance_loss_mlp": 1.157763, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.08249773787970602, + "language_loss": 0.90893888, + "learning_rate": 0.0009840083174047782, + "loss": 0.92077172, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.25537109, + "step": 563, + "time_per_iteration": 2.7391836643218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194315, + "balance_loss_mlp": 1.16986513, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.07051664629026161, + "language_loss": 0.85589021, + "learning_rate": 0.0009839300618454685, + "loss": 0.86783338, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.24438477, + "step": 564, + "time_per_iteration": 2.89290452003479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194246, + "balance_loss_mlp": 1.16989148, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.07367825547097939, + "language_loss": 0.91287452, + "learning_rate": 0.0009838516184078466, + "loss": 0.92481697, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.24353027, + "step": 565, + "time_per_iteration": 2.8416025638580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201114, + "balance_loss_mlp": 1.17573452, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.11472314835583913, 
+ "language_loss": 0.88207066, + "learning_rate": 0.0009837729871223669, + "loss": 0.89408183, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.25402832, + "step": 566, + "time_per_iteration": 2.6492197513580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249487, + "balance_loss_mlp": 1.22309399, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.07200956845133732, + "language_loss": 0.88285792, + "learning_rate": 0.0009836941680195568, + "loss": 0.89535284, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.2644043, + "step": 567, + "time_per_iteration": 2.794311285018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124424, + "balance_loss_mlp": 1.21801353, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.08672877457635139, + "language_loss": 0.83671671, + "learning_rate": 0.0009836151611300166, + "loss": 0.84915912, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.26245117, + "step": 568, + "time_per_iteration": 3.2202959060668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232018, + "balance_loss_mlp": 1.2069366, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.0737206182188589, + "language_loss": 0.9499715, + "learning_rate": 0.0009835359664844194, + "loss": 0.96229166, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.25097656, + "step": 569, + "time_per_iteration": 2.6723880767822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115473, + "balance_loss_mlp": 1.09935594, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.05305645754414589, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82152283, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.16113281, + "step": 570, + "time_per_iteration": 4.934283494949341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01262159, + "balance_loss_mlp": 1.23583817, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.0759630537733653, + "language_loss": 0.91932368, + "learning_rate": 0.0009833770140481118, + "loss": 0.93194532, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.2635498, + "step": 571, + "time_per_iteration": 2.6325361728668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240536, + "balance_loss_mlp": 1.21385729, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.07085220990305834, + "language_loss": 0.82309085, + "learning_rate": 0.000983297256319112, + "loss": 0.83549619, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.26733398, + "step": 572, + "time_per_iteration": 3.230297088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227341, + "balance_loss_mlp": 1.20004177, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.0905445578460947, + "language_loss": 0.86770016, + "learning_rate": 0.000983217310957477, + "loss": 0.87997353, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.27319336, + "step": 573, + "time_per_iteration": 2.8283607959747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230404, + "balance_loss_mlp": 1.20267606, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.08397098324277796, + "language_loss": 0.89933473, + "learning_rate": 0.000983137177994244, + "loss": 0.91163886, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.27734375, + "step": 574, + "time_per_iteration": 2.945197820663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184809, + "balance_loss_mlp": 1.15805852, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.08995501683398337, + "language_loss": 0.85942268, + "learning_rate": 0.0009830568574605235, + "loss": 0.87127078, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 
0.26782227, + "step": 575, + "time_per_iteration": 2.9714908599853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173303, + "balance_loss_mlp": 1.14733911, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.11617237422906017, + "language_loss": 0.87585467, + "learning_rate": 0.0009829763493874992, + "loss": 0.88758773, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.2598877, + "step": 576, + "time_per_iteration": 3.0522892475128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185519, + "balance_loss_mlp": 1.15929341, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.07800734946110352, + "language_loss": 0.92923808, + "learning_rate": 0.0009828956538064264, + "loss": 0.94109321, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.26245117, + "step": 577, + "time_per_iteration": 2.8397951126098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198525, + "balance_loss_mlp": 1.17312193, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.07768178407950788, + "language_loss": 0.90871215, + "learning_rate": 0.0009828147707486344, + "loss": 0.92069739, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.25427246, + "step": 578, + "time_per_iteration": 2.714322805404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120727, + "balance_loss_mlp": 1.18262911, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.08360568840749934, + "language_loss": 0.86554426, + "learning_rate": 0.0009827337002455245, + "loss": 0.877617, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.24645996, + "step": 579, + "time_per_iteration": 2.742311477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195367, + "balance_loss_mlp": 1.17049956, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.07475116375685303, + "language_loss": 
0.87853694, + "learning_rate": 0.0009826524423285712, + "loss": 0.89049065, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.24865723, + "step": 580, + "time_per_iteration": 3.014310121536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212273, + "balance_loss_mlp": 1.18770432, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.09493717034802315, + "language_loss": 0.88884461, + "learning_rate": 0.0009825709970293218, + "loss": 0.90096736, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.2454834, + "step": 581, + "time_per_iteration": 3.004209518432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215399, + "balance_loss_mlp": 1.19164097, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.0873103144771369, + "language_loss": 0.95079505, + "learning_rate": 0.0009824893643793956, + "loss": 0.96294904, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.23754883, + "step": 582, + "time_per_iteration": 3.0893442630767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220934, + "balance_loss_mlp": 1.1956501, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.08836320076119632, + "language_loss": 0.87841964, + "learning_rate": 0.0009824075444104857, + "loss": 0.89062899, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.25280762, + "step": 583, + "time_per_iteration": 2.7537503242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239656, + "balance_loss_mlp": 1.21521807, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.16884309783149784, + "language_loss": 0.93345737, + "learning_rate": 0.000982325537154357, + "loss": 0.94585395, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.24450684, + "step": 584, + "time_per_iteration": 2.59409499168396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211835, + 
"balance_loss_mlp": 1.18743277, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.08768097982415915, + "language_loss": 0.93578511, + "learning_rate": 0.0009822433426428484, + "loss": 0.94790351, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.24401855, + "step": 585, + "time_per_iteration": 2.581516742706299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190012, + "balance_loss_mlp": 1.16627765, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.09638114373338931, + "language_loss": 0.8707509, + "learning_rate": 0.0009821609609078697, + "loss": 0.88265103, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.23730469, + "step": 586, + "time_per_iteration": 2.6160855293273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192266, + "balance_loss_mlp": 1.16885376, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.08368187760832956, + "language_loss": 0.89230156, + "learning_rate": 0.0009820783919814045, + "loss": 0.90422428, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.23425293, + "step": 587, + "time_per_iteration": 2.8534207344055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168851, + "balance_loss_mlp": 1.14552212, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.1429978790264596, + "language_loss": 0.82743758, + "learning_rate": 0.0009819956358955095, + "loss": 0.83912605, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.2331543, + "step": 588, + "time_per_iteration": 2.5901453495025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173293, + "balance_loss_mlp": 1.14966619, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.08588056281957461, + "language_loss": 0.84002471, + "learning_rate": 0.0009819126926823127, + "loss": 0.85175765, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.23608398, + 
"step": 589, + "time_per_iteration": 2.530374765396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202956, + "balance_loss_mlp": 1.17918611, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.07487704505114483, + "language_loss": 0.86892301, + "learning_rate": 0.000981829562374016, + "loss": 0.88095254, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.2376709, + "step": 590, + "time_per_iteration": 2.8030459880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244494, + "balance_loss_mlp": 1.22037804, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.12123010147526934, + "language_loss": 0.97345364, + "learning_rate": 0.0009817462450028933, + "loss": 0.98589861, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.24108887, + "step": 591, + "time_per_iteration": 2.7129569053649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233399, + "balance_loss_mlp": 1.20995021, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.08245604807530345, + "language_loss": 0.85052103, + "learning_rate": 0.0009816627406012916, + "loss": 0.86285496, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.23425293, + "step": 592, + "time_per_iteration": 2.8424665927886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218738, + "balance_loss_mlp": 1.19550395, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.128701110372521, + "language_loss": 0.84672415, + "learning_rate": 0.0009815790492016295, + "loss": 0.85891157, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.23217773, + "step": 593, + "time_per_iteration": 2.95451283454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171348, + "balance_loss_mlp": 1.14887691, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.14505795416516268, + "language_loss": 0.86793518, + 
"learning_rate": 0.0009814951708363993, + "loss": 0.87964857, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.22473145, + "step": 594, + "time_per_iteration": 2.85953950881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125332, + "balance_loss_mlp": 1.11083615, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.044045371588173315, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79116321, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.14453125, + "step": 595, + "time_per_iteration": 4.819102048873901 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116091, + "balance_loss_mlp": 1.09400165, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.15046985558242026, + "language_loss": 0.88265449, + "learning_rate": 0.0009813268533395648, + "loss": 0.8938154, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.2208252, + "step": 596, + "time_per_iteration": 2.5988821983337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127071, + "balance_loss_mlp": 1.10389698, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.12036284201424394, + "language_loss": 0.87534207, + "learning_rate": 0.0009812424142733073, + "loss": 0.88661277, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.23168945, + "step": 597, + "time_per_iteration": 2.5434508323669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011341, + "balance_loss_mlp": 1.11084187, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.21736642596268407, + "language_loss": 0.85729969, + "learning_rate": 0.000981157788372175, + "loss": 0.86864072, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.23242188, + "step": 598, + "time_per_iteration": 3.0409185886383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140706, + "balance_loss_mlp": 
1.11694789, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.09609751014588512, + "language_loss": 0.89140439, + "learning_rate": 0.0009810729756690223, + "loss": 0.90281147, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.23742676, + "step": 599, + "time_per_iteration": 2.7512025833129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149306, + "balance_loss_mlp": 1.12485611, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.09347854332414611, + "language_loss": 0.92009699, + "learning_rate": 0.0009809879761967766, + "loss": 0.93159008, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.24438477, + "step": 600, + "time_per_iteration": 2.966771364212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114736, + "balance_loss_mlp": 1.1223377, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.11723124982013416, + "language_loss": 0.86307055, + "learning_rate": 0.0009809027899884378, + "loss": 0.87454414, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.25036621, + "step": 601, + "time_per_iteration": 2.960700273513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160833, + "balance_loss_mlp": 1.13693142, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.1190375758971125, + "language_loss": 0.88418448, + "learning_rate": 0.0009808174170770779, + "loss": 0.89579284, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.2388916, + "step": 602, + "time_per_iteration": 2.8176493644714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012435, + "balance_loss_mlp": 0.99622273, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.011178693541089954, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.85910678, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.16210938, + "step": 603, + 
"time_per_iteration": 4.909565448760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_mlp": 1.24103987, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.08512052059651275, + "language_loss": 0.93440074, + "learning_rate": 0.0009806461112779462, + "loss": 0.94705629, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.24511719, + "step": 604, + "time_per_iteration": 2.658644199371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134615, + "balance_loss_mlp": 1.3188746, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.21802356099424494, + "language_loss": 0.87949467, + "learning_rate": 0.0009805601784566814, + "loss": 0.89295614, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.27294922, + "step": 605, + "time_per_iteration": 2.5276598930358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334827, + "balance_loss_mlp": 1.30897105, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.1053210941194693, + "language_loss": 0.95447874, + "learning_rate": 0.0009804740590654089, + "loss": 0.96782702, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.25854492, + "step": 606, + "time_per_iteration": 2.6621856689453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237167, + "balance_loss_mlp": 1.2128365, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.09607271254678196, + "language_loss": 0.89416385, + "learning_rate": 0.0009803877531375635, + "loss": 0.90653551, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.2434082, + "step": 607, + "time_per_iteration": 2.8813462257385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219293, + "balance_loss_mlp": 1.19459295, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.08760560664793143, + "language_loss": 0.90707058, + "learning_rate": 
0.0009803012607066523, + "loss": 0.91926354, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.24707031, + "step": 608, + "time_per_iteration": 2.7780392169952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185589, + "balance_loss_mlp": 1.16223621, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.10290817733218703, + "language_loss": 0.89330381, + "learning_rate": 0.0009802145818062543, + "loss": 0.90515971, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.23339844, + "step": 609, + "time_per_iteration": 2.713611364364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189107, + "balance_loss_mlp": 1.16636157, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.1057697966066493, + "language_loss": 0.91819966, + "learning_rate": 0.0009801277164700212, + "loss": 0.93009067, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.22741699, + "step": 610, + "time_per_iteration": 2.575333595275879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207403, + "balance_loss_mlp": 1.18378794, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.09616788336185009, + "language_loss": 0.89864278, + "learning_rate": 0.0009800406647316776, + "loss": 0.91071677, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.23608398, + "step": 611, + "time_per_iteration": 2.831953287124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156372, + "balance_loss_mlp": 1.14006376, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.06675579160113412, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78070831, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.16308594, + "step": 612, + "time_per_iteration": 4.820984840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252443, + "balance_loss_mlp": 1.22860086, + 
"epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.12351306502077156, + "language_loss": 0.8851943, + "learning_rate": 0.000979866002183916, + "loss": 0.89771867, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.23815918, + "step": 613, + "time_per_iteration": 2.6552364826202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233257, + "balance_loss_mlp": 1.20900965, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.09504576379881025, + "language_loss": 0.8953172, + "learning_rate": 0.0009797783914423082, + "loss": 0.90764976, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.24243164, + "step": 614, + "time_per_iteration": 2.8509650230407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120351, + "balance_loss_mlp": 1.18043077, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.09364161863028009, + "language_loss": 0.8453747, + "learning_rate": 0.0009796905944342094, + "loss": 0.85740978, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.23071289, + "step": 615, + "time_per_iteration": 2.8491313457489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204695, + "balance_loss_mlp": 1.18137729, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.07677248067083364, + "language_loss": 0.88213146, + "learning_rate": 0.0009796026111937057, + "loss": 0.89417839, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.2331543, + "step": 616, + "time_per_iteration": 2.601902484893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165796, + "balance_loss_mlp": 1.14331329, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.0938738615494663, + "language_loss": 0.88620937, + "learning_rate": 0.0009795144417549552, + "loss": 0.89786732, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.22473145, + "step": 617, + 
"time_per_iteration": 2.7134363651275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168398, + "balance_loss_mlp": 1.14661872, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.10272804913481705, + "language_loss": 0.89757544, + "learning_rate": 0.0009794260861521883, + "loss": 0.90925944, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.21801758, + "step": 618, + "time_per_iteration": 2.831108331680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156602, + "balance_loss_mlp": 1.13393998, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.1607893611237687, + "language_loss": 0.87325203, + "learning_rate": 0.0009793375444197075, + "loss": 0.88481802, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.2265625, + "step": 619, + "time_per_iteration": 2.6383235454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174094, + "balance_loss_mlp": 1.15122962, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.10347254391959168, + "language_loss": 0.85134327, + "learning_rate": 0.000979248816591888, + "loss": 0.8630842, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.22875977, + "step": 620, + "time_per_iteration": 2.7817084789276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186173, + "balance_loss_mlp": 1.16314173, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.09880033160570031, + "language_loss": 0.85983694, + "learning_rate": 0.0009791599027031766, + "loss": 0.87169874, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.23010254, + "step": 621, + "time_per_iteration": 3.0790488719940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202902, + "balance_loss_mlp": 1.17933416, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.0888737424862181, + "language_loss": 0.85755396, + "learning_rate": 
0.0009790708027880932, + "loss": 0.86958289, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.2355957, + "step": 622, + "time_per_iteration": 2.839409351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148001, + "balance_loss_mlp": 1.13073957, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.05973140246409555, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78575295, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.17285156, + "step": 623, + "time_per_iteration": 4.827035665512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208498, + "balance_loss_mlp": 1.18456042, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.14072799304068395, + "language_loss": 0.92775166, + "learning_rate": 0.0009788920450172487, + "loss": 0.93983662, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.23925781, + "step": 624, + "time_per_iteration": 2.688457489013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186727, + "balance_loss_mlp": 1.16287279, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.09148145427830927, + "language_loss": 0.89981961, + "learning_rate": 0.0009788023872308875, + "loss": 0.9116869, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.23852539, + "step": 625, + "time_per_iteration": 2.5552427768707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073346, + "balance_loss_mlp": 1.05656123, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.03421346211042783, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76502347, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.16796875, + "step": 626, + "time_per_iteration": 4.845045804977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152939, + "balance_loss_mlp": 1.12972903, + 
"epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.16289185985396562, + "language_loss": 0.93840104, + "learning_rate": 0.0009786225140303285, + "loss": 0.94993043, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.23217773, + "step": 627, + "time_per_iteration": 2.697042465209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167636, + "balance_loss_mlp": 1.14417565, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.2209026580633741, + "language_loss": 0.91874695, + "learning_rate": 0.0009785322986859634, + "loss": 0.93042338, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.23461914, + "step": 628, + "time_per_iteration": 2.6944122314453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153297, + "balance_loss_mlp": 1.12997985, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.07492335946827373, + "language_loss": 0.92751127, + "learning_rate": 0.0009784418975588838, + "loss": 0.93904424, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.23303223, + "step": 629, + "time_per_iteration": 2.7154979705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156607, + "balance_loss_mlp": 1.1338973, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.07449655700779013, + "language_loss": 0.9307186, + "learning_rate": 0.0009783513106841862, + "loss": 0.9422847, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.22717285, + "step": 630, + "time_per_iteration": 2.704155921936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078203, + "balance_loss_mlp": 1.06208599, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.048222043628353826, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77810907, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.16113281, + "step": 631, + 
"time_per_iteration": 4.9827399253845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186068, + "balance_loss_mlp": 1.16259575, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.0695579405445101, + "language_loss": 0.87454391, + "learning_rate": 0.0009781695798326854, + "loss": 0.88640457, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.23474121, + "step": 632, + "time_per_iteration": 2.6077868938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119723, + "balance_loss_mlp": 1.17401958, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.0874974071775435, + "language_loss": 0.87916714, + "learning_rate": 0.0009780784359264365, + "loss": 0.89113945, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.23205566, + "step": 633, + "time_per_iteration": 2.6383118629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040346, + "balance_loss_mlp": 1.02403784, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.031225790586482303, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75229043, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.16308594, + "step": 634, + "time_per_iteration": 4.7924864292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217333, + "balance_loss_mlp": 1.19409907, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.07796234580729426, + "language_loss": 0.8718015, + "learning_rate": 0.000977895591329867, + "loss": 0.88397485, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.23205566, + "step": 635, + "time_per_iteration": 2.803107976913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234374, + "balance_loss_mlp": 1.21001959, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.11392323325170377, + "language_loss": 0.86567664, + 
"learning_rate": 0.000977803890710533, + "loss": 0.87802041, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.24304199, + "step": 636, + "time_per_iteration": 2.751648187637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120141, + "balance_loss_mlp": 1.17864108, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.07701221180236865, + "language_loss": 0.93102324, + "learning_rate": 0.0009777120045912774, + "loss": 0.94303727, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.2277832, + "step": 637, + "time_per_iteration": 2.691467761993408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186061, + "balance_loss_mlp": 1.16312516, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.08871868954386787, + "language_loss": 0.89725113, + "learning_rate": 0.0009776199330077736, + "loss": 0.90911174, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.22924805, + "step": 638, + "time_per_iteration": 2.7779197692871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117449, + "balance_loss_mlp": 1.15229297, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.08051745841053916, + "language_loss": 0.91847914, + "learning_rate": 0.0009775276759957667, + "loss": 0.93022406, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.2220459, + "step": 639, + "time_per_iteration": 2.8452744483947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170739, + "balance_loss_mlp": 1.14792228, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.0993028160053512, + "language_loss": 0.89413661, + "learning_rate": 0.0009774352335910745, + "loss": 0.90584403, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.22814941, + "step": 640, + "time_per_iteration": 2.8268258571624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011491, + "balance_loss_mlp": 
1.12753499, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.08449801570349542, + "language_loss": 0.9440136, + "learning_rate": 0.000977342605829586, + "loss": 0.9555046, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.21569824, + "step": 641, + "time_per_iteration": 2.7570323944091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162286, + "balance_loss_mlp": 1.13913512, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.11072842132379487, + "language_loss": 0.85702711, + "learning_rate": 0.0009772497927472623, + "loss": 0.86864996, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.23144531, + "step": 642, + "time_per_iteration": 3.1265273094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165192, + "balance_loss_mlp": 1.14213657, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.12556940690050455, + "language_loss": 0.84848756, + "learning_rate": 0.0009771567943801368, + "loss": 0.86013943, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.23034668, + "step": 643, + "time_per_iteration": 2.652181386947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160262, + "balance_loss_mlp": 1.13739729, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.08337524575338892, + "language_loss": 0.8885237, + "learning_rate": 0.0009770636107643152, + "loss": 0.90012634, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.22851562, + "step": 644, + "time_per_iteration": 2.7387216091156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165702, + "balance_loss_mlp": 1.14195597, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.19339175735102193, + "language_loss": 0.86818463, + "learning_rate": 0.0009769702419359738, + "loss": 0.87984169, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.23730469, + "step": 645, + 
"time_per_iteration": 2.6588823795318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173541, + "balance_loss_mlp": 1.15027177, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.07743854144019968, + "language_loss": 0.88816965, + "learning_rate": 0.000976876687931362, + "loss": 0.89990509, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.23254395, + "step": 646, + "time_per_iteration": 3.0269463062286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143309, + "balance_loss_mlp": 1.1195029, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.09200303883175577, + "language_loss": 0.84307587, + "learning_rate": 0.0009767829487868005, + "loss": 0.85450894, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.23791504, + "step": 647, + "time_per_iteration": 2.652456045150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136141, + "balance_loss_mlp": 1.11240613, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.0914449303971137, + "language_loss": 0.88396645, + "learning_rate": 0.000976689024538682, + "loss": 0.89532787, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.23718262, + "step": 648, + "time_per_iteration": 2.66267466545105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114222, + "balance_loss_mlp": 1.11798477, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.0994157560321478, + "language_loss": 0.86652195, + "learning_rate": 0.0009765949152234716, + "loss": 0.87794411, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.2421875, + "step": 649, + "time_per_iteration": 2.9676578044891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130303, + "balance_loss_mlp": 1.11628377, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.046775068167293626, + "language_loss": 0.78686082, + "learning_rate": 
0.0009765006208777055, + "loss": 0.79816383, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.140625, + "step": 650, + "time_per_iteration": 4.760566711425781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117959, + "balance_loss_mlp": 1.1559155, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.09210474588463947, + "language_loss": 0.813963, + "learning_rate": 0.0009764061415379919, + "loss": 0.82575887, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.2364502, + "step": 651, + "time_per_iteration": 3.3511757850646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120244, + "balance_loss_mlp": 1.17746568, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.09212981752556385, + "language_loss": 0.87756586, + "learning_rate": 0.0009763114772410109, + "loss": 0.88959026, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.24975586, + "step": 652, + "time_per_iteration": 2.5980827808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224224, + "balance_loss_mlp": 1.20058513, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.08737716532166849, + "language_loss": 0.86069119, + "learning_rate": 0.0009762166280235146, + "loss": 0.87293345, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.23632812, + "step": 653, + "time_per_iteration": 2.9842958450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232055, + "balance_loss_mlp": 1.2083323, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.10849525216708464, + "language_loss": 0.86920303, + "learning_rate": 0.0009761215939223267, + "loss": 0.88152361, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.23706055, + "step": 654, + "time_per_iteration": 2.741058349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120932, + "balance_loss_mlp": 1.18547845, + "epoch": 
0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.12794458644218995, + "language_loss": 0.85666406, + "learning_rate": 0.0009760263749743428, + "loss": 0.86875725, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.23828125, + "step": 655, + "time_per_iteration": 2.5808663368225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180222, + "balance_loss_mlp": 1.15707195, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.095199105706819, + "language_loss": 0.89238775, + "learning_rate": 0.0009759309712165299, + "loss": 0.90418994, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.23144531, + "step": 656, + "time_per_iteration": 2.748532295227051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181081, + "balance_loss_mlp": 1.15800261, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.10916020635653645, + "language_loss": 0.9220295, + "learning_rate": 0.0009758353826859272, + "loss": 0.93384039, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.23071289, + "step": 657, + "time_per_iteration": 2.595853805541992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177185, + "balance_loss_mlp": 1.15273547, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.12847037355320456, + "language_loss": 0.87952709, + "learning_rate": 0.0009757396094196456, + "loss": 0.89129901, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.24438477, + "step": 658, + "time_per_iteration": 2.8620266914367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203673, + "balance_loss_mlp": 1.17950892, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.07321655622824354, + "language_loss": 0.83431864, + "learning_rate": 0.0009756436514548673, + "loss": 0.84635538, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.24169922, + "step": 659, + "time_per_iteration": 
2.912091016769409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217457, + "balance_loss_mlp": 1.19229198, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.10055529179538837, + "language_loss": 0.8726669, + "learning_rate": 0.0009755475088288466, + "loss": 0.88484144, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.25183105, + "step": 660, + "time_per_iteration": 2.781341075897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243827, + "balance_loss_mlp": 1.218292, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.1174457122427187, + "language_loss": 0.88868487, + "learning_rate": 0.0009754511815789095, + "loss": 0.90112311, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.25537109, + "step": 661, + "time_per_iteration": 2.8132684230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246052, + "balance_loss_mlp": 1.21920574, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.09745592985886121, + "language_loss": 0.8455224, + "learning_rate": 0.0009753546697424533, + "loss": 0.85798287, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.26904297, + "step": 662, + "time_per_iteration": 2.7095847129821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243151, + "balance_loss_mlp": 1.21792674, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.12502287201474796, + "language_loss": 0.89571029, + "learning_rate": 0.0009752579733569475, + "loss": 0.90814179, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.25244141, + "step": 663, + "time_per_iteration": 2.6534910202026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119538, + "balance_loss_mlp": 1.17935824, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.048799046747725165, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, 
+ "loss": 0.7607677, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.16015625, + "step": 664, + "time_per_iteration": 4.974175453186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218811, + "balance_loss_mlp": 1.19439721, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.1143586633045421, + "language_loss": 0.88993388, + "learning_rate": 0.0009750640270890217, + "loss": 0.90212196, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.24401855, + "step": 665, + "time_per_iteration": 2.7663521766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124539, + "balance_loss_mlp": 1.22150016, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.11930184932546978, + "language_loss": 0.94833052, + "learning_rate": 0.0009749667772818983, + "loss": 0.96078444, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.23876953, + "step": 666, + "time_per_iteration": 3.01556134223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120473, + "balance_loss_mlp": 1.10473776, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.04410313188129877, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78056413, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.15722656, + "step": 667, + "time_per_iteration": 4.865432262420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226701, + "balance_loss_mlp": 1.20370543, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.11041987280926156, + "language_loss": 0.94443977, + "learning_rate": 0.0009747717245101093, + "loss": 0.95670676, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.22998047, + "step": 668, + "time_per_iteration": 2.564667224884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217641, + "balance_loss_mlp": 1.19444275, + "epoch": 0.12870334744132358, + 
"flos": 479939240448.0, + "grad_norm": 0.0905963820135437, + "language_loss": 0.84166789, + "learning_rate": 0.00097467392162117, + "loss": 0.85384434, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.23193359, + "step": 669, + "time_per_iteration": 2.625565528869629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218894, + "balance_loss_mlp": 1.19641066, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.07707390480747152, + "language_loss": 0.90709603, + "learning_rate": 0.0009745759344474708, + "loss": 0.919285, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.22485352, + "step": 670, + "time_per_iteration": 2.828810691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210198, + "balance_loss_mlp": 1.18807316, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.1296418275033253, + "language_loss": 0.88266867, + "learning_rate": 0.0009744777630270536, + "loss": 0.89477074, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.22119141, + "step": 671, + "time_per_iteration": 2.5931460857391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205012, + "balance_loss_mlp": 1.18351889, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.16263031414063664, + "language_loss": 0.92705458, + "learning_rate": 0.000974379407398032, + "loss": 0.93910474, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.21508789, + "step": 672, + "time_per_iteration": 2.947148323059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208232, + "balance_loss_mlp": 1.18665552, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.09135110996657969, + "language_loss": 0.81593442, + "learning_rate": 0.0009742808675985913, + "loss": 0.82801676, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.21594238, + "step": 673, + "time_per_iteration": 3.179880380630493 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223794, + "balance_loss_mlp": 1.20184779, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.08798796705409132, + "language_loss": 0.89740491, + "learning_rate": 0.0009741821436669876, + "loss": 0.90964288, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.21948242, + "step": 674, + "time_per_iteration": 2.5925161838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230582, + "balance_loss_mlp": 1.20812273, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.13739173158435178, + "language_loss": 0.91820276, + "learning_rate": 0.0009740832356415492, + "loss": 0.93050855, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.22473145, + "step": 675, + "time_per_iteration": 2.5184531211853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223494, + "balance_loss_mlp": 1.20120144, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.10341661200692882, + "language_loss": 0.87010336, + "learning_rate": 0.0009739841435606756, + "loss": 0.88233835, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.22290039, + "step": 676, + "time_per_iteration": 3.0507655143737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207183, + "balance_loss_mlp": 1.18511748, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.08057490768106465, + "language_loss": 0.89111441, + "learning_rate": 0.0009738848674628377, + "loss": 0.90318626, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.2208252, + "step": 677, + "time_per_iteration": 2.745363235473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121617, + "balance_loss_mlp": 1.19430709, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.0856975246411629, + "language_loss": 0.88498092, + "learning_rate": 0.000973785407386578, + "loss": 0.89714259, + 
"num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.21862793, + "step": 678, + "time_per_iteration": 2.778620958328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214606, + "balance_loss_mlp": 1.191324, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.06828211935324495, + "language_loss": 0.86676407, + "learning_rate": 0.0009736857633705103, + "loss": 0.87891012, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.23266602, + "step": 679, + "time_per_iteration": 2.9231183528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209443, + "balance_loss_mlp": 1.18695986, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.0834800111741461, + "language_loss": 0.92100477, + "learning_rate": 0.0009735859354533196, + "loss": 0.93309915, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.22460938, + "step": 680, + "time_per_iteration": 2.775928258895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195854, + "balance_loss_mlp": 1.17248893, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.10927888529927046, + "language_loss": 0.91257143, + "learning_rate": 0.0009734859236737628, + "loss": 0.92453003, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.23339844, + "step": 681, + "time_per_iteration": 2.684873342514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171844, + "balance_loss_mlp": 1.1486578, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.1264690256888091, + "language_loss": 0.92692226, + "learning_rate": 0.0009733857280706678, + "loss": 0.93864071, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.23168945, + "step": 682, + "time_per_iteration": 2.6460657119750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174213, + "balance_loss_mlp": 1.15156293, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + 
"grad_norm": 0.10018713039090629, + "language_loss": 0.83565485, + "learning_rate": 0.000973285348682934, + "loss": 0.84739697, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.2265625, + "step": 683, + "time_per_iteration": 2.758242607116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114811, + "balance_loss_mlp": 1.13504481, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.05076292773380049, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79046488, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.13085938, + "step": 684, + "time_per_iteration": 4.8192243576049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204502, + "balance_loss_mlp": 1.17974257, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.1066751932733185, + "language_loss": 0.84567851, + "learning_rate": 0.0009730840387095046, + "loss": 0.85772359, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.24768066, + "step": 685, + "time_per_iteration": 3.3115832805633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227082, + "balance_loss_mlp": 1.20198846, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.07078465407426249, + "language_loss": 0.90421009, + "learning_rate": 0.0009729831082019642, + "loss": 0.9164809, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.25097656, + "step": 686, + "time_per_iteration": 2.8678879737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252204, + "balance_loss_mlp": 1.22750425, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.09776828955155538, + "language_loss": 0.8801111, + "learning_rate": 0.0009728819940660958, + "loss": 0.89263314, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.24707031, + "step": 687, + "time_per_iteration": 2.7938969135284424 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01263825, + "balance_loss_mlp": 1.23863578, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.10048379585191887, + "language_loss": 0.84283459, + "learning_rate": 0.0009727806963411557, + "loss": 0.8554728, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.25195312, + "step": 688, + "time_per_iteration": 2.607588529586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239051, + "balance_loss_mlp": 1.2133261, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.08603068006049115, + "language_loss": 0.8672629, + "learning_rate": 0.000972679215066471, + "loss": 0.87965345, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.25756836, + "step": 689, + "time_per_iteration": 2.7422516345977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224087, + "balance_loss_mlp": 1.19882667, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.10287089436887557, + "language_loss": 0.9870705, + "learning_rate": 0.0009725775502814401, + "loss": 0.99931133, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.25268555, + "step": 690, + "time_per_iteration": 2.5919952392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192013, + "balance_loss_mlp": 1.16732466, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.1091756570575493, + "language_loss": 0.84613961, + "learning_rate": 0.0009724757020255327, + "loss": 0.85805976, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.2467041, + "step": 691, + "time_per_iteration": 2.851348400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011849, + "balance_loss_mlp": 1.15994906, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.0968423296469171, + "language_loss": 0.86866987, + "learning_rate": 0.0009723736703382902, + "loss": 0.88051891, + "num_input_tokens_seen": 
57836272, + "router_z_loss_mlp": 0.24951172, + "step": 692, + "time_per_iteration": 2.5881834030151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179974, + "balance_loss_mlp": 1.15652537, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.10463911515585092, + "language_loss": 0.82742584, + "learning_rate": 0.0009722714552593244, + "loss": 0.83922553, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.23413086, + "step": 693, + "time_per_iteration": 2.6343894004821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186752, + "balance_loss_mlp": 1.16344643, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.12210775976205426, + "language_loss": 0.93531036, + "learning_rate": 0.000972169056828319, + "loss": 0.94717789, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.23303223, + "step": 694, + "time_per_iteration": 2.4834342002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183255, + "balance_loss_mlp": 1.16046166, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.08175934073664855, + "language_loss": 0.87263072, + "learning_rate": 0.0009720664750850283, + "loss": 0.88446331, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.22790527, + "step": 695, + "time_per_iteration": 2.796005964279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191523, + "balance_loss_mlp": 1.16836047, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.0918947132133249, + "language_loss": 0.92442453, + "learning_rate": 0.0009719637100692784, + "loss": 0.9363398, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.23168945, + "step": 696, + "time_per_iteration": 2.7338545322418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173282, + "balance_loss_mlp": 1.15093064, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 
0.09425790223988205, + "language_loss": 0.82822204, + "learning_rate": 0.0009718607618209661, + "loss": 0.83995485, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.22351074, + "step": 697, + "time_per_iteration": 2.8834567070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167823, + "balance_loss_mlp": 1.14468443, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.07380520807835853, + "language_loss": 0.87331033, + "learning_rate": 0.0009717576303800595, + "loss": 0.88498855, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.23120117, + "step": 698, + "time_per_iteration": 3.0662593841552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189182, + "balance_loss_mlp": 1.1649704, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.08733354578890483, + "language_loss": 0.85059655, + "learning_rate": 0.0009716543157865975, + "loss": 0.86248839, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.24182129, + "step": 699, + "time_per_iteration": 2.7156968116760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210396, + "balance_loss_mlp": 1.1879611, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.08759306221047211, + "language_loss": 0.82954025, + "learning_rate": 0.0009715508180806907, + "loss": 0.84164423, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.22436523, + "step": 700, + "time_per_iteration": 3.204936981201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209609, + "balance_loss_mlp": 1.18669748, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.07843453256975112, + "language_loss": 0.89359999, + "learning_rate": 0.0009714471373025202, + "loss": 0.90569609, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.22900391, + "step": 701, + "time_per_iteration": 3.4600374698638916 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01186239, + "balance_loss_mlp": 1.16323161, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.07505390512906053, + "language_loss": 0.88395512, + "learning_rate": 0.0009713432734923386, + "loss": 0.89581752, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.23010254, + "step": 702, + "time_per_iteration": 2.638005018234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173659, + "balance_loss_mlp": 1.15109301, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.09376344684626736, + "language_loss": 0.86520576, + "learning_rate": 0.0009712392266904696, + "loss": 0.8769424, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.22558594, + "step": 703, + "time_per_iteration": 2.7503063678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116976, + "balance_loss_mlp": 1.14838624, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.18430775331568308, + "language_loss": 0.85049546, + "learning_rate": 0.0009711349969373076, + "loss": 0.86219305, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.21386719, + "step": 704, + "time_per_iteration": 3.1815178394317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166202, + "balance_loss_mlp": 1.14376664, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.08099598593900344, + "language_loss": 0.80275941, + "learning_rate": 0.0009710305842733178, + "loss": 0.81442142, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.22436523, + "step": 705, + "time_per_iteration": 2.7307353019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152043, + "balance_loss_mlp": 1.13138402, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.08979940018627898, + "language_loss": 0.89208561, + "learning_rate": 0.0009709259887390373, + "loss": 0.90360606, + "num_input_tokens_seen": 
58901712, + "router_z_loss_mlp": 0.20666504, + "step": 706, + "time_per_iteration": 2.6135804653167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160605, + "balance_loss_mlp": 1.13901603, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.11609486524108804, + "language_loss": 0.9066751, + "learning_rate": 0.0009708212103750737, + "loss": 0.91828114, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.21606445, + "step": 707, + "time_per_iteration": 2.632742166519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185341, + "balance_loss_mlp": 1.16383576, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.10488018026765993, + "language_loss": 0.86886567, + "learning_rate": 0.0009707162492221051, + "loss": 0.88071907, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.21508789, + "step": 708, + "time_per_iteration": 2.9155325889587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221514, + "balance_loss_mlp": 1.19948387, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.11565397704484869, + "language_loss": 0.87553132, + "learning_rate": 0.0009706111053208815, + "loss": 0.88774645, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.22058105, + "step": 709, + "time_per_iteration": 2.843981981277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233371, + "balance_loss_mlp": 1.21016061, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.10007182380605975, + "language_loss": 0.85645008, + "learning_rate": 0.0009705057787122232, + "loss": 0.86878371, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.23193359, + "step": 710, + "time_per_iteration": 2.594890832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195501, + "balance_loss_mlp": 1.17281508, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 
0.08836610284298578, + "language_loss": 0.90505099, + "learning_rate": 0.0009704002694370216, + "loss": 0.91700602, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.22680664, + "step": 711, + "time_per_iteration": 2.5702362060546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117145, + "balance_loss_mlp": 1.14863288, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.11670756159189942, + "language_loss": 0.86028767, + "learning_rate": 0.0009702945775362388, + "loss": 0.87200224, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.22802734, + "step": 712, + "time_per_iteration": 2.6679470539093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149984, + "balance_loss_mlp": 1.12776387, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.10271253203083616, + "language_loss": 0.86890107, + "learning_rate": 0.0009701887030509086, + "loss": 0.8804009, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.22229004, + "step": 713, + "time_per_iteration": 2.618906021118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112596, + "balance_loss_mlp": 1.1041683, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.09375417211048337, + "language_loss": 0.90942538, + "learning_rate": 0.0009700826460221346, + "loss": 0.92068493, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.21801758, + "step": 714, + "time_per_iteration": 2.7277417182922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133315, + "balance_loss_mlp": 1.11104631, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.1250628990201497, + "language_loss": 0.92436254, + "learning_rate": 0.0009699764064910921, + "loss": 0.93569565, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.22265625, + "step": 715, + "time_per_iteration": 2.900053024291992 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01129357, + "balance_loss_mlp": 1.10697007, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.18348866981846054, + "language_loss": 0.86833155, + "learning_rate": 0.0009698699844990268, + "loss": 0.87962508, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.22387695, + "step": 716, + "time_per_iteration": 2.645792245864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136392, + "balance_loss_mlp": 1.11483872, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.08476879745046602, + "language_loss": 0.87948525, + "learning_rate": 0.0009697633800872555, + "loss": 0.89084923, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.21557617, + "step": 717, + "time_per_iteration": 2.9197771549224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153248, + "balance_loss_mlp": 1.13183844, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.08051298122060387, + "language_loss": 0.90472651, + "learning_rate": 0.0009696565932971655, + "loss": 0.91625893, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.2142334, + "step": 718, + "time_per_iteration": 2.9118661880493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157354, + "balance_loss_mlp": 1.1350143, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.09173992406124648, + "language_loss": 0.897349, + "learning_rate": 0.0009695496241702153, + "loss": 0.90892255, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.22338867, + "step": 719, + "time_per_iteration": 2.8108739852905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184496, + "balance_loss_mlp": 1.16145301, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.09716749239115424, + "language_loss": 0.85599422, + "learning_rate": 0.0009694424727479339, + "loss": 0.86783922, + "num_input_tokens_seen": 59914720, 
+ "router_z_loss_mlp": 0.23034668, + "step": 720, + "time_per_iteration": 2.9078242778778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190722, + "balance_loss_mlp": 1.16825104, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.08276113558291018, + "language_loss": 0.88687241, + "learning_rate": 0.0009693351390719213, + "loss": 0.89877963, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.22473145, + "step": 721, + "time_per_iteration": 2.727829933166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214778, + "balance_loss_mlp": 1.19178224, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.08055125516722848, + "language_loss": 0.9053812, + "learning_rate": 0.000969227623183848, + "loss": 0.91752893, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.22998047, + "step": 722, + "time_per_iteration": 2.8233954906463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202462, + "balance_loss_mlp": 1.17980003, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.06957111358845897, + "language_loss": 0.90902817, + "learning_rate": 0.0009691199251254554, + "loss": 0.92105281, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.22668457, + "step": 723, + "time_per_iteration": 2.838449001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188398, + "balance_loss_mlp": 1.16651106, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.08029384244148012, + "language_loss": 0.86382651, + "learning_rate": 0.0009690120449385555, + "loss": 0.87571049, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.21899414, + "step": 724, + "time_per_iteration": 2.7877347469329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191299, + "balance_loss_mlp": 1.16917384, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.113442270614728, 
+ "language_loss": 0.92300928, + "learning_rate": 0.0009689039826650312, + "loss": 0.93492222, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.22131348, + "step": 725, + "time_per_iteration": 2.8086507320404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219435, + "balance_loss_mlp": 1.20293677, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.07583456833656638, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77742493, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.16503906, + "step": 726, + "time_per_iteration": 4.891220331192017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177839, + "balance_loss_mlp": 1.15583265, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.4935698294407845, + "language_loss": 0.86680418, + "learning_rate": 0.0009686873120259941, + "loss": 0.8785826, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.22021484, + "step": 727, + "time_per_iteration": 2.584016799926758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220836, + "balance_loss_mlp": 1.19853175, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.12530325225106098, + "language_loss": 0.86788189, + "learning_rate": 0.0009685787037446004, + "loss": 0.88009018, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.22314453, + "step": 728, + "time_per_iteration": 2.7812938690185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256231, + "balance_loss_mlp": 1.2321384, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.19184429152401888, + "language_loss": 0.86789989, + "learning_rate": 0.0009684699135448201, + "loss": 0.88046223, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.24072266, + "step": 729, + "time_per_iteration": 2.7354156970977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01316519, + "balance_loss_mlp": 1.29105544, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.08142335105414879, + "language_loss": 0.91990757, + "learning_rate": 0.0009683609414688895, + "loss": 0.93307269, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.25463867, + "step": 730, + "time_per_iteration": 2.7542572021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396811, + "balance_loss_mlp": 1.36896372, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.09882218945586521, + "language_loss": 0.86064744, + "learning_rate": 0.0009682517875591154, + "loss": 0.87461555, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.27856445, + "step": 731, + "time_per_iteration": 2.7971835136413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440007, + "balance_loss_mlp": 1.41070533, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.11775580833602758, + "language_loss": 0.85772473, + "learning_rate": 0.0009681424518578749, + "loss": 0.87212479, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.29248047, + "step": 732, + "time_per_iteration": 2.742525100708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460191, + "balance_loss_mlp": 1.43045998, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.14540211876107528, + "language_loss": 0.87523216, + "learning_rate": 0.000968032934407616, + "loss": 0.88983405, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.296875, + "step": 733, + "time_per_iteration": 2.586650848388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01389602, + "balance_loss_mlp": 1.35989547, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.09505428174523772, + "language_loss": 0.81872886, + "learning_rate": 0.0009679232352508571, + "loss": 0.83262491, + "num_input_tokens_seen": 61072016, + 
"router_z_loss_mlp": 0.29711914, + "step": 734, + "time_per_iteration": 2.8065295219421387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01337262, + "balance_loss_mlp": 1.30776978, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.08594729931011787, + "language_loss": 0.8053807, + "learning_rate": 0.0009678133544301871, + "loss": 0.8187533, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.29492188, + "step": 735, + "time_per_iteration": 2.681156635284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290781, + "balance_loss_mlp": 1.26231337, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.07917660118228964, + "language_loss": 0.91284931, + "learning_rate": 0.0009677032919882658, + "loss": 0.92575711, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.28442383, + "step": 736, + "time_per_iteration": 2.701876163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267129, + "balance_loss_mlp": 1.2393055, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.11161795715290385, + "language_loss": 0.91632634, + "learning_rate": 0.000967593047967823, + "loss": 0.92899764, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.27832031, + "step": 737, + "time_per_iteration": 2.549489736557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257034, + "balance_loss_mlp": 1.22987819, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.11515852654264594, + "language_loss": 0.86905932, + "learning_rate": 0.0009674826224116593, + "loss": 0.88162971, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.27160645, + "step": 738, + "time_per_iteration": 2.8459107875823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01274254, + "balance_loss_mlp": 1.24875474, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.086163857469945, + 
"language_loss": 0.8627907, + "learning_rate": 0.0009673720153626455, + "loss": 0.87553322, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.25512695, + "step": 739, + "time_per_iteration": 2.6033051013946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01298128, + "balance_loss_mlp": 1.27345169, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.07922284002741106, + "language_loss": 0.8672145, + "learning_rate": 0.0009672612268637235, + "loss": 0.88019574, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.2467041, + "step": 740, + "time_per_iteration": 2.639249801635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331294, + "balance_loss_mlp": 1.30575967, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.09083563941739939, + "language_loss": 0.84015429, + "learning_rate": 0.0009671502569579048, + "loss": 0.85346723, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.25537109, + "step": 741, + "time_per_iteration": 2.784358263015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372438, + "balance_loss_mlp": 1.34778547, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.08785871424370759, + "language_loss": 0.89829892, + "learning_rate": 0.0009670391056882719, + "loss": 0.91202337, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.2467041, + "step": 742, + "time_per_iteration": 2.765284299850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01384139, + "balance_loss_mlp": 1.35946321, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.09890816943940939, + "language_loss": 0.88263386, + "learning_rate": 0.0009669277730979776, + "loss": 0.89647526, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.24694824, + "step": 743, + "time_per_iteration": 3.2124171257019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01409259, + "balance_loss_mlp": 1.38365269, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.08939291923456745, + "language_loss": 0.85339808, + "learning_rate": 0.0009668162592302449, + "loss": 0.86749065, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.25610352, + "step": 744, + "time_per_iteration": 2.947239398956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01413521, + "balance_loss_mlp": 1.38784337, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.12486964956779355, + "language_loss": 0.86141676, + "learning_rate": 0.0009667045641283676, + "loss": 0.87555194, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.25708008, + "step": 745, + "time_per_iteration": 2.67399001121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345232, + "balance_loss_mlp": 1.32049656, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.09833561966825685, + "language_loss": 0.94721901, + "learning_rate": 0.0009665926878357092, + "loss": 0.96067131, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.24743652, + "step": 746, + "time_per_iteration": 2.951524257659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308325, + "balance_loss_mlp": 1.28470945, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.09374380516730212, + "language_loss": 0.90804815, + "learning_rate": 0.0009664806303957043, + "loss": 0.92113143, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.23608398, + "step": 747, + "time_per_iteration": 2.7018370628356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290979, + "balance_loss_mlp": 1.26711321, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.09976705309421963, + "language_loss": 0.87274301, + "learning_rate": 0.0009663683918518571, + "loss": 0.88565284, + "num_input_tokens_seen": 62097904, + 
"router_z_loss_mlp": 0.23840332, + "step": 748, + "time_per_iteration": 2.9669973850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260109, + "balance_loss_mlp": 1.23742342, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.09601459473111058, + "language_loss": 0.85172814, + "learning_rate": 0.0009662559722477428, + "loss": 0.86432928, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.22680664, + "step": 749, + "time_per_iteration": 2.692737579345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01313989, + "balance_loss_mlp": 1.2952019, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.07630612016334831, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77476966, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.1875, + "step": 750, + "time_per_iteration": 5.012727975845337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203648, + "balance_loss_mlp": 1.18093836, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.10872642357348963, + "language_loss": 0.88863885, + "learning_rate": 0.0009660305900333632, + "loss": 0.90067536, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.22705078, + "step": 751, + "time_per_iteration": 2.715942859649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173383, + "balance_loss_mlp": 1.15045881, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.08046883529286915, + "language_loss": 0.82496673, + "learning_rate": 0.0009659176275105992, + "loss": 0.83670056, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.22924805, + "step": 752, + "time_per_iteration": 2.713360071182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180444, + "balance_loss_mlp": 1.15698361, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.07494959784402849, + 
"language_loss": 0.85518491, + "learning_rate": 0.0009658044841025701, + "loss": 0.86698937, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.23425293, + "step": 753, + "time_per_iteration": 2.7982797622680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117179, + "balance_loss_mlp": 1.14774585, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.10908868033385523, + "language_loss": 0.81575012, + "learning_rate": 0.0009656911598532021, + "loss": 0.82746804, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.24023438, + "step": 754, + "time_per_iteration": 2.642843246459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216386, + "balance_loss_mlp": 1.19192445, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.08024204468384731, + "language_loss": 0.89968902, + "learning_rate": 0.0009655776548064917, + "loss": 0.91185284, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.24462891, + "step": 755, + "time_per_iteration": 2.6598751544952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240024, + "balance_loss_mlp": 1.2152878, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.0778788297064716, + "language_loss": 0.88022745, + "learning_rate": 0.0009654639690065054, + "loss": 0.89262772, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.24743652, + "step": 756, + "time_per_iteration": 2.8861637115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126465, + "balance_loss_mlp": 1.2393297, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.09020306103656467, + "language_loss": 0.87895447, + "learning_rate": 0.00096535010249738, + "loss": 0.89160097, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.25317383, + "step": 757, + "time_per_iteration": 2.7438864707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270837, 
+ "balance_loss_mlp": 1.24456334, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.12633601395220453, + "language_loss": 0.82038969, + "learning_rate": 0.0009652360553233224, + "loss": 0.83309805, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.26318359, + "step": 758, + "time_per_iteration": 2.7446844577789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210641, + "balance_loss_mlp": 1.18994594, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.05582061662785393, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.7498439, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.20703125, + "step": 759, + "time_per_iteration": 4.942702054977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212546, + "balance_loss_mlp": 1.18641555, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.06567012775246582, + "language_loss": 0.81178761, + "learning_rate": 0.0009650074191575883, + "loss": 0.8239131, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.26171875, + "step": 760, + "time_per_iteration": 3.2085912227630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198598, + "balance_loss_mlp": 1.17261064, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.07877672537318793, + "language_loss": 0.85659027, + "learning_rate": 0.0009648928302546766, + "loss": 0.86857623, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.26013184, + "step": 761, + "time_per_iteration": 2.7206709384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176065, + "balance_loss_mlp": 1.15095961, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.07899561323852963, + "language_loss": 0.85068321, + "learning_rate": 0.0009647780608643613, + "loss": 0.86244392, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 
0.25109863, + "step": 762, + "time_per_iteration": 3.4438586235046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170525, + "balance_loss_mlp": 1.14620686, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.141987740723005, + "language_loss": 0.87758678, + "learning_rate": 0.0009646631110312001, + "loss": 0.88929206, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.24304199, + "step": 763, + "time_per_iteration": 2.6546902656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152908, + "balance_loss_mlp": 1.12836289, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.07748728130668867, + "language_loss": 0.88344562, + "learning_rate": 0.0009645479807998203, + "loss": 0.89497471, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.2454834, + "step": 764, + "time_per_iteration": 2.7865586280822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149811, + "balance_loss_mlp": 1.12623131, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.07163260805176828, + "language_loss": 0.92376024, + "learning_rate": 0.0009644326702149196, + "loss": 0.93525833, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.23571777, + "step": 765, + "time_per_iteration": 2.729707717895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114598, + "balance_loss_mlp": 1.12176871, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.10016890685610987, + "language_loss": 0.84570462, + "learning_rate": 0.0009643171793212653, + "loss": 0.85716444, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.24206543, + "step": 766, + "time_per_iteration": 3.104130983352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147763, + "balance_loss_mlp": 1.12331319, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.0994818648660217, + "language_loss": 
0.88828337, + "learning_rate": 0.0009642015081636952, + "loss": 0.89976102, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.24438477, + "step": 767, + "time_per_iteration": 2.6991779804229736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118291, + "balance_loss_mlp": 1.15871024, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.10983776315132832, + "language_loss": 0.87698913, + "learning_rate": 0.0009640856567871166, + "loss": 0.8888182, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.24182129, + "step": 768, + "time_per_iteration": 2.5240631103515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212502, + "balance_loss_mlp": 1.18818331, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.07387168528771362, + "language_loss": 0.88451684, + "learning_rate": 0.0009639696252365072, + "loss": 0.89664185, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.24304199, + "step": 769, + "time_per_iteration": 3.0557117462158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239197, + "balance_loss_mlp": 1.21551013, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.09914913961127292, + "language_loss": 0.8159318, + "learning_rate": 0.0009638534135569144, + "loss": 0.82832372, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.23657227, + "step": 770, + "time_per_iteration": 2.9298524856567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245135, + "balance_loss_mlp": 1.22161531, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.09866814803314855, + "language_loss": 0.89646047, + "learning_rate": 0.0009637370217934554, + "loss": 0.90891182, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.23498535, + "step": 771, + "time_per_iteration": 2.682309865951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221897, + 
"balance_loss_mlp": 1.19855595, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.06824551266768007, + "language_loss": 0.83023787, + "learning_rate": 0.0009636204499913175, + "loss": 0.84245688, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.23327637, + "step": 772, + "time_per_iteration": 2.883767604827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223775, + "balance_loss_mlp": 1.20097065, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.07043844896966983, + "language_loss": 0.87725186, + "learning_rate": 0.0009635036981957581, + "loss": 0.88948965, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.22802734, + "step": 773, + "time_per_iteration": 2.9000537395477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187129, + "balance_loss_mlp": 1.16394269, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.15141860037933205, + "language_loss": 0.90646893, + "learning_rate": 0.0009633867664521043, + "loss": 0.91834021, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.23168945, + "step": 774, + "time_per_iteration": 2.8832309246063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169397, + "balance_loss_mlp": 1.14643705, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.08953509264873717, + "language_loss": 0.86451691, + "learning_rate": 0.0009632696548057527, + "loss": 0.87621093, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.22961426, + "step": 775, + "time_per_iteration": 2.5678458213806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114459, + "balance_loss_mlp": 1.12229764, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.10138358829207124, + "language_loss": 0.84634435, + "learning_rate": 0.0009631523633021704, + "loss": 0.85779023, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 
0.22290039, + "step": 776, + "time_per_iteration": 2.8479549884796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127015, + "balance_loss_mlp": 1.10418677, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.10363335088159256, + "language_loss": 0.88188493, + "learning_rate": 0.0009630348919868936, + "loss": 0.89315504, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.22814941, + "step": 777, + "time_per_iteration": 2.7757747173309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136966, + "balance_loss_mlp": 1.11441135, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.09986786801472973, + "language_loss": 0.81042939, + "learning_rate": 0.0009629172409055293, + "loss": 0.82179904, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.22558594, + "step": 778, + "time_per_iteration": 2.5126540660858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145221, + "balance_loss_mlp": 1.12336957, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.09261188529780942, + "language_loss": 0.87480628, + "learning_rate": 0.0009627994101037531, + "loss": 0.88625842, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.21875, + "step": 779, + "time_per_iteration": 2.7716262340545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115587, + "balance_loss_mlp": 1.13354254, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.08443086809005321, + "language_loss": 0.88840389, + "learning_rate": 0.0009626813996273114, + "loss": 0.8999626, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.22338867, + "step": 780, + "time_per_iteration": 2.8740992546081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186209, + "balance_loss_mlp": 1.16370249, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.09833782575281567, + "language_loss": 
0.88844621, + "learning_rate": 0.0009625632095220198, + "loss": 0.90030831, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.22497559, + "step": 781, + "time_per_iteration": 2.9050698280334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204209, + "balance_loss_mlp": 1.18169069, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.1242367807618526, + "language_loss": 0.87087309, + "learning_rate": 0.0009624448398337637, + "loss": 0.88291514, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.22509766, + "step": 782, + "time_per_iteration": 2.5470597743988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227287, + "balance_loss_mlp": 1.20476806, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.08884420814610612, + "language_loss": 0.8877629, + "learning_rate": 0.0009623262906084984, + "loss": 0.90003586, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.22521973, + "step": 783, + "time_per_iteration": 3.0006895065307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229008, + "balance_loss_mlp": 1.20682311, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.08808618298813263, + "language_loss": 0.8990804, + "learning_rate": 0.0009622075618922486, + "loss": 0.91137052, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.22192383, + "step": 784, + "time_per_iteration": 2.7111520767211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207095, + "balance_loss_mlp": 1.18568492, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.08652833198143661, + "language_loss": 0.87003136, + "learning_rate": 0.0009620886537311091, + "loss": 0.88210225, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.2142334, + "step": 785, + "time_per_iteration": 2.6401422023773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181477, + 
"balance_loss_mlp": 1.15950704, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.2899950143802249, + "language_loss": 0.85118186, + "learning_rate": 0.000961969566171244, + "loss": 0.8629967, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.21972656, + "step": 786, + "time_per_iteration": 2.526909351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196609, + "balance_loss_mlp": 1.17443573, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.08121966250588863, + "language_loss": 0.90082663, + "learning_rate": 0.0009618502992588873, + "loss": 0.91279268, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.22167969, + "step": 787, + "time_per_iteration": 2.6575541496276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230806, + "balance_loss_mlp": 1.20764375, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.0715770490301525, + "language_loss": 0.87907356, + "learning_rate": 0.0009617308530403424, + "loss": 0.89138162, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.23168945, + "step": 788, + "time_per_iteration": 3.028930187225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258523, + "balance_loss_mlp": 1.23478842, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0802298351217653, + "language_loss": 0.87239158, + "learning_rate": 0.0009616112275619825, + "loss": 0.8849768, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.23718262, + "step": 789, + "time_per_iteration": 2.746056079864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132077, + "balance_loss_mlp": 1.29635596, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.12648829262821384, + "language_loss": 0.83592963, + "learning_rate": 0.0009614914228702503, + "loss": 0.84913737, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.24414062, + 
"step": 790, + "time_per_iteration": 2.6734559535980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308065, + "balance_loss_mlp": 1.28415179, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.09276885660597874, + "language_loss": 0.89010954, + "learning_rate": 0.0009613714390116581, + "loss": 0.9031902, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.23901367, + "step": 791, + "time_per_iteration": 2.983484983444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285002, + "balance_loss_mlp": 1.26071882, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.07985140077311874, + "language_loss": 0.85613286, + "learning_rate": 0.0009612512760327879, + "loss": 0.86898291, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.24291992, + "step": 792, + "time_per_iteration": 2.883850336074829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244556, + "balance_loss_mlp": 1.21998703, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.09831690791880561, + "language_loss": 0.84491324, + "learning_rate": 0.0009611309339802909, + "loss": 0.85735881, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.24560547, + "step": 793, + "time_per_iteration": 2.4435439109802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207721, + "balance_loss_mlp": 1.1844871, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.0855298606279622, + "language_loss": 0.83781004, + "learning_rate": 0.0009610104129008881, + "loss": 0.84988725, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.23205566, + "step": 794, + "time_per_iteration": 3.13722825050354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196195, + "balance_loss_mlp": 1.17304444, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.09863604959388503, + "language_loss": 0.88015008, + 
"learning_rate": 0.0009608897128413701, + "loss": 0.89211196, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.23132324, + "step": 795, + "time_per_iteration": 2.746291160583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176373, + "balance_loss_mlp": 1.15306783, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.059228494387600535, + "language_loss": 0.85641718, + "learning_rate": 0.0009607688338485965, + "loss": 0.86818099, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.23278809, + "step": 796, + "time_per_iteration": 2.8617959022521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152933, + "balance_loss_mlp": 1.12994909, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.11279362274359876, + "language_loss": 0.90298712, + "learning_rate": 0.0009606477759694969, + "loss": 0.91451651, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.22998047, + "step": 797, + "time_per_iteration": 3.054548978805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147506, + "balance_loss_mlp": 1.12495136, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.1240450491743707, + "language_loss": 0.87260056, + "learning_rate": 0.0009605265392510703, + "loss": 0.88407564, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.22546387, + "step": 798, + "time_per_iteration": 2.660917282104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164418, + "balance_loss_mlp": 1.14092219, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.07786555450456673, + "language_loss": 0.91656721, + "learning_rate": 0.0009604051237403846, + "loss": 0.92821133, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.23474121, + "step": 799, + "time_per_iteration": 2.6837708950042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189347, + "balance_loss_mlp": 
1.16668534, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.09844042951466975, + "language_loss": 0.85933173, + "learning_rate": 0.0009602835294845776, + "loss": 0.87122524, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.2265625, + "step": 800, + "time_per_iteration": 2.4643006324768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201717, + "balance_loss_mlp": 1.17804241, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.08383413994751185, + "language_loss": 0.90000272, + "learning_rate": 0.0009601617565308565, + "loss": 0.91201991, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.23681641, + "step": 801, + "time_per_iteration": 2.6335196495056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211967, + "balance_loss_mlp": 1.18856657, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.11945007862770202, + "language_loss": 0.86351627, + "learning_rate": 0.0009600398049264977, + "loss": 0.87563592, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.23413086, + "step": 802, + "time_per_iteration": 3.0110597610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188433, + "balance_loss_mlp": 1.16469824, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.08697800210878956, + "language_loss": 0.9162643, + "learning_rate": 0.0009599176747188469, + "loss": 0.92814863, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.23718262, + "step": 803, + "time_per_iteration": 2.828881025314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169648, + "balance_loss_mlp": 1.14716554, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.09755966571473051, + "language_loss": 0.82901067, + "learning_rate": 0.0009597953659553196, + "loss": 0.84070712, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.22485352, + "step": 804, + 
"time_per_iteration": 2.744241952896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163065, + "balance_loss_mlp": 1.14110649, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.08461871579014175, + "language_loss": 0.8877238, + "learning_rate": 0.0009596728786833997, + "loss": 0.89935452, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.21960449, + "step": 805, + "time_per_iteration": 2.637615203857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153079, + "balance_loss_mlp": 1.13075089, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.07567223700797457, + "language_loss": 0.89263672, + "learning_rate": 0.0009595502129506415, + "loss": 0.90416753, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.2232666, + "step": 806, + "time_per_iteration": 3.381657838821411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157381, + "balance_loss_mlp": 1.13502955, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.08260081287011234, + "language_loss": 0.82411599, + "learning_rate": 0.0009594273688046678, + "loss": 0.8356899, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.22351074, + "step": 807, + "time_per_iteration": 2.7444403171539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135063, + "balance_loss_mlp": 1.11292577, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.12637371348528909, + "language_loss": 0.85436296, + "learning_rate": 0.000959304346293171, + "loss": 0.8657136, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.22155762, + "step": 808, + "time_per_iteration": 2.630800485610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138798, + "balance_loss_mlp": 1.11732841, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.1222248699411619, + "language_loss": 0.87775064, + "learning_rate": 
0.0009591811454639125, + "loss": 0.8891387, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.21484375, + "step": 809, + "time_per_iteration": 2.7841880321502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140586, + "balance_loss_mlp": 1.11836529, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.0775612296994351, + "language_loss": 0.87793982, + "learning_rate": 0.0009590577663647234, + "loss": 0.88934565, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.22216797, + "step": 810, + "time_per_iteration": 2.7182021141052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171751, + "balance_loss_mlp": 1.14905357, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.0958777530802899, + "language_loss": 0.85768712, + "learning_rate": 0.0009589342090435036, + "loss": 0.86940467, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.22692871, + "step": 811, + "time_per_iteration": 2.794064521789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186257, + "balance_loss_mlp": 1.16242695, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07937656910484732, + "language_loss": 0.86963636, + "learning_rate": 0.0009588104735482223, + "loss": 0.88149893, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.23803711, + "step": 812, + "time_per_iteration": 2.7221293449401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208738, + "balance_loss_mlp": 1.18419302, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.1117655096069856, + "language_loss": 0.83743179, + "learning_rate": 0.0009586865599269177, + "loss": 0.84951913, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.24536133, + "step": 813, + "time_per_iteration": 2.690633773803711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238315, + "balance_loss_mlp": 1.21402001, + 
"epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.10590050341373854, + "language_loss": 0.8774755, + "learning_rate": 0.0009585624682276977, + "loss": 0.88985866, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.24291992, + "step": 814, + "time_per_iteration": 2.756228446960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269022, + "balance_loss_mlp": 1.24407113, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.10996897761132594, + "language_loss": 0.87169892, + "learning_rate": 0.0009584381984987386, + "loss": 0.88438916, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.24938965, + "step": 815, + "time_per_iteration": 2.554874897003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264413, + "balance_loss_mlp": 1.23899746, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.08063052755401852, + "language_loss": 0.89821672, + "learning_rate": 0.0009583137507882864, + "loss": 0.91086084, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.25415039, + "step": 816, + "time_per_iteration": 2.667743444442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249115, + "balance_loss_mlp": 1.22435474, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.09885575067946582, + "language_loss": 0.80580056, + "learning_rate": 0.000958189125144656, + "loss": 0.81829166, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.24768066, + "step": 817, + "time_per_iteration": 2.727062463760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234827, + "balance_loss_mlp": 1.21099687, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.07125848643416562, + "language_loss": 0.88058704, + "learning_rate": 0.0009580643216162313, + "loss": 0.89293534, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.23803711, + "step": 818, + 
"time_per_iteration": 2.7225098609924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207, + "balance_loss_mlp": 1.18336058, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.1140894572848919, + "language_loss": 0.79018641, + "learning_rate": 0.0009579393402514652, + "loss": 0.80225646, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.23608398, + "step": 819, + "time_per_iteration": 2.623739004135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174289, + "balance_loss_mlp": 1.15172231, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.099553940880187, + "language_loss": 0.90219855, + "learning_rate": 0.0009578141810988801, + "loss": 0.9139415, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.22546387, + "step": 820, + "time_per_iteration": 2.6413519382476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115754, + "balance_loss_mlp": 1.13443768, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.07166699024259414, + "language_loss": 0.90092921, + "learning_rate": 0.0009576888442070668, + "loss": 0.91250455, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.23095703, + "step": 821, + "time_per_iteration": 2.586008310317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114947, + "balance_loss_mlp": 1.12679601, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.12314887338256089, + "language_loss": 0.91971326, + "learning_rate": 0.0009575633296246854, + "loss": 0.93120795, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.22668457, + "step": 822, + "time_per_iteration": 2.582914113998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153752, + "balance_loss_mlp": 1.13104272, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.08930039023036396, + "language_loss": 0.83068377, + "learning_rate": 
0.0009574376374004652, + "loss": 0.84222132, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.22692871, + "step": 823, + "time_per_iteration": 2.689706563949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174252, + "balance_loss_mlp": 1.15108991, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.08166713358237257, + "language_loss": 0.80265462, + "learning_rate": 0.000957311767583204, + "loss": 0.81439716, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.23156738, + "step": 824, + "time_per_iteration": 2.5872888565063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134272, + "balance_loss_mlp": 1.11863208, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.027722115426624477, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83205861, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.15625, + "step": 825, + "time_per_iteration": 4.749661445617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186754, + "balance_loss_mlp": 1.16332912, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.0939924469385621, + "language_loss": 0.91145539, + "learning_rate": 0.0009570594953650961, + "loss": 0.92332292, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.23425293, + "step": 826, + "time_per_iteration": 2.5129754543304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211327, + "balance_loss_mlp": 1.1879499, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.08032409834180723, + "language_loss": 0.80093443, + "learning_rate": 0.00095693309306219, + "loss": 0.81304777, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.23364258, + "step": 827, + "time_per_iteration": 3.116727352142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203567, + "balance_loss_mlp": 1.17957044, + "epoch": 
0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.07716345894173686, + "language_loss": 0.87652111, + "learning_rate": 0.0009568065133621244, + "loss": 0.88855684, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.23986816, + "step": 828, + "time_per_iteration": 3.3514394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186554, + "balance_loss_mlp": 1.1635462, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.09010150887645839, + "language_loss": 0.84615266, + "learning_rate": 0.0009566797563140422, + "loss": 0.85801816, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.23022461, + "step": 829, + "time_per_iteration": 2.8772377967834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178682, + "balance_loss_mlp": 1.15541196, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.07629618570457763, + "language_loss": 0.87662935, + "learning_rate": 0.0009565528219671547, + "loss": 0.88841611, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.23266602, + "step": 830, + "time_per_iteration": 2.9242594242095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168807, + "balance_loss_mlp": 1.14639533, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.07916714158721186, + "language_loss": 0.84442008, + "learning_rate": 0.0009564257103707418, + "loss": 0.85610813, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.22424316, + "step": 831, + "time_per_iteration": 2.615751266479492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115633, + "balance_loss_mlp": 1.13395441, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.07401424691307211, + "language_loss": 0.9042899, + "learning_rate": 0.0009562984215741533, + "loss": 0.91585314, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.22387695, + "step": 832, + "time_per_iteration": 
2.666475296020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143834, + "balance_loss_mlp": 1.12204242, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.07498028486943187, + "language_loss": 0.82129556, + "learning_rate": 0.0009561709556268065, + "loss": 0.83273387, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.21801758, + "step": 833, + "time_per_iteration": 2.757997512817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139242, + "balance_loss_mlp": 1.11768937, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.09759486121205484, + "language_loss": 0.94624776, + "learning_rate": 0.0009560433125781884, + "loss": 0.95764017, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.21569824, + "step": 834, + "time_per_iteration": 2.7897424697875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141895, + "balance_loss_mlp": 1.12007987, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.11927807309460302, + "language_loss": 0.92270857, + "learning_rate": 0.0009559154924778544, + "loss": 0.93412757, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.21838379, + "step": 835, + "time_per_iteration": 2.7300117015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146944, + "balance_loss_mlp": 1.12510526, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.08296253434867956, + "language_loss": 0.85007012, + "learning_rate": 0.0009557874953754284, + "loss": 0.8615396, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.21862793, + "step": 836, + "time_per_iteration": 3.0692667961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171621, + "balance_loss_mlp": 1.15024722, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08629072980134203, + "language_loss": 0.83071995, + "learning_rate": 
0.0009556593213206038, + "loss": 0.84243613, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.21374512, + "step": 837, + "time_per_iteration": 2.762371778488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198448, + "balance_loss_mlp": 1.17696667, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.07520594985182873, + "language_loss": 0.8681106, + "learning_rate": 0.0009555309703631414, + "loss": 0.88009512, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.21484375, + "step": 838, + "time_per_iteration": 2.721184253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216338, + "balance_loss_mlp": 1.19352138, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.08529357587841585, + "language_loss": 0.87116075, + "learning_rate": 0.0009554024425528722, + "loss": 0.88332415, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.22802734, + "step": 839, + "time_per_iteration": 2.7104406356811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223488, + "balance_loss_mlp": 1.20211315, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.09500040264705899, + "language_loss": 0.88661861, + "learning_rate": 0.0009552737379396948, + "loss": 0.89885342, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.21386719, + "step": 840, + "time_per_iteration": 2.6247448921203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214716, + "balance_loss_mlp": 1.19292414, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.06615948862952871, + "language_loss": 0.87843263, + "learning_rate": 0.0009551448565735767, + "loss": 0.8905797, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.21826172, + "step": 841, + "time_per_iteration": 2.8262698650360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211691, + "balance_loss_mlp": 1.19057953, + 
"epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.09887794790206932, + "language_loss": 0.8426103, + "learning_rate": 0.0009550157985045543, + "loss": 0.85472721, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.21130371, + "step": 842, + "time_per_iteration": 3.0120604038238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206189, + "balance_loss_mlp": 1.18486238, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.08797554821911514, + "language_loss": 0.88739967, + "learning_rate": 0.0009548865637827321, + "loss": 0.89946151, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.21337891, + "step": 843, + "time_per_iteration": 2.6481337547302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204978, + "balance_loss_mlp": 1.18372297, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.09077498619786414, + "language_loss": 0.89573538, + "learning_rate": 0.0009547571524582838, + "loss": 0.90778512, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.21264648, + "step": 844, + "time_per_iteration": 2.5942928791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183078, + "balance_loss_mlp": 1.16156065, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.0818153207807116, + "language_loss": 0.92094475, + "learning_rate": 0.0009546275645814512, + "loss": 0.93277556, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.21533203, + "step": 845, + "time_per_iteration": 2.6533596515655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183489, + "balance_loss_mlp": 1.16250849, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.09434310518898727, + "language_loss": 0.89099437, + "learning_rate": 0.0009544978002025446, + "loss": 0.90282923, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.20983887, + "step": 846, + 
"time_per_iteration": 2.595737934112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174812, + "balance_loss_mlp": 1.15389085, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.0786790126962769, + "language_loss": 0.86643338, + "learning_rate": 0.0009543678593719434, + "loss": 0.87818146, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.20922852, + "step": 847, + "time_per_iteration": 2.734328508377075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172317, + "balance_loss_mlp": 1.1513598, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.07855126038492752, + "language_loss": 0.87300336, + "learning_rate": 0.0009542377421400945, + "loss": 0.88472658, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.2097168, + "step": 848, + "time_per_iteration": 2.8172829151153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168862, + "balance_loss_mlp": 1.14789319, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.06818105137358721, + "language_loss": 0.83380383, + "learning_rate": 0.0009541074485575145, + "loss": 0.84549248, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.20983887, + "step": 849, + "time_per_iteration": 2.7554948329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153029, + "balance_loss_mlp": 1.13229823, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.07075228162905194, + "language_loss": 0.91935623, + "learning_rate": 0.0009539769786747874, + "loss": 0.93088651, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.20739746, + "step": 850, + "time_per_iteration": 2.681631326675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150714, + "balance_loss_mlp": 1.13010252, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.07677284982742894, + "language_loss": 0.80944598, + "learning_rate": 
0.0009538463325425665, + "loss": 0.82095313, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.20617676, + "step": 851, + "time_per_iteration": 2.735233783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154047, + "balance_loss_mlp": 1.13384068, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.11739032058616317, + "language_loss": 0.85686159, + "learning_rate": 0.0009537155102115728, + "loss": 0.86840206, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.20202637, + "step": 852, + "time_per_iteration": 2.620140790939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130528, + "balance_loss_mlp": 1.11065602, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.10634949324965158, + "language_loss": 0.83208728, + "learning_rate": 0.0009535845117325961, + "loss": 0.84339261, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.1986084, + "step": 853, + "time_per_iteration": 2.664644241333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137711, + "balance_loss_mlp": 1.11726654, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.07583670741084705, + "language_loss": 0.9317174, + "learning_rate": 0.0009534533371564946, + "loss": 0.94309455, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.20446777, + "step": 854, + "time_per_iteration": 2.801784038543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132055, + "balance_loss_mlp": 1.11093068, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.10901038327062007, + "language_loss": 0.88220453, + "learning_rate": 0.0009533219865341949, + "loss": 0.89352506, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.21130371, + "step": 855, + "time_per_iteration": 2.5974481105804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145642, + "balance_loss_mlp": 1.12525666, + "epoch": 
0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.08694797679629615, + "language_loss": 0.86617303, + "learning_rate": 0.0009531904599166916, + "loss": 0.87762946, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.20385742, + "step": 856, + "time_per_iteration": 2.6515426635742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165234, + "balance_loss_mlp": 1.1438601, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.10972732987763288, + "language_loss": 0.84639692, + "learning_rate": 0.0009530587573550478, + "loss": 0.85804921, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.21374512, + "step": 857, + "time_per_iteration": 2.5966737270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141097, + "balance_loss_mlp": 1.1243124, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.04856663639913232, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75460482, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.16796875, + "step": 858, + "time_per_iteration": 5.004236698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122785, + "balance_loss_mlp": 1.20751262, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.09065075677374754, + "language_loss": 0.89923048, + "learning_rate": 0.0009527948246039337, + "loss": 0.91150904, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.20336914, + "step": 859, + "time_per_iteration": 2.5762951374053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250043, + "balance_loss_mlp": 1.22891951, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.10611361403402562, + "language_loss": 0.87094891, + "learning_rate": 0.000952662594516931, + "loss": 0.88344932, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.21130371, + "step": 860, + "time_per_iteration": 
3.1250970363616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211235, + "balance_loss_mlp": 1.19042134, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.07567437441181586, + "language_loss": 0.86383927, + "learning_rate": 0.0009525301886907234, + "loss": 0.87595159, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.20812988, + "step": 861, + "time_per_iteration": 2.8821423053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119851, + "balance_loss_mlp": 1.17725468, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.09117738037536942, + "language_loss": 0.87712085, + "learning_rate": 0.0009523976071767155, + "loss": 0.88910592, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.21252441, + "step": 862, + "time_per_iteration": 2.7509195804595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164214, + "balance_loss_mlp": 1.14342415, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.08626936460480303, + "language_loss": 0.87840152, + "learning_rate": 0.00095226485002638, + "loss": 0.89004362, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.20800781, + "step": 863, + "time_per_iteration": 2.835188150405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148836, + "balance_loss_mlp": 1.12823641, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.09501605355258884, + "language_loss": 0.88929522, + "learning_rate": 0.0009521319172912576, + "loss": 0.90078366, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.20605469, + "step": 864, + "time_per_iteration": 2.773681879043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148606, + "balance_loss_mlp": 1.12822115, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.1262431233900787, + "language_loss": 0.94519138, + "learning_rate": 
0.0009519988090229579, + "loss": 0.95667744, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.20385742, + "step": 865, + "time_per_iteration": 2.7055397033691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134112, + "balance_loss_mlp": 1.11327457, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.10486068908473449, + "language_loss": 0.87655658, + "learning_rate": 0.0009518655252731576, + "loss": 0.88789773, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.20849609, + "step": 866, + "time_per_iteration": 2.774974822998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124242, + "balance_loss_mlp": 1.102844, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.09006182482731041, + "language_loss": 0.90070617, + "learning_rate": 0.0009517320660936022, + "loss": 0.91194862, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.2142334, + "step": 867, + "time_per_iteration": 2.7388041019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126447, + "balance_loss_mlp": 1.1068728, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.09548967470897408, + "language_loss": 0.82877147, + "learning_rate": 0.0009515984315361051, + "loss": 0.84003592, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.19555664, + "step": 868, + "time_per_iteration": 2.822772264480591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113243, + "balance_loss_mlp": 1.11205709, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.10934486098426227, + "language_loss": 0.86598766, + "learning_rate": 0.000951464621652548, + "loss": 0.87731194, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.20373535, + "step": 869, + "time_per_iteration": 2.648505687713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159804, + "balance_loss_mlp": 1.13964605, + "epoch": 
0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.11951376597850719, + "language_loss": 0.7861675, + "learning_rate": 0.0009513306364948804, + "loss": 0.79776561, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.20153809, + "step": 870, + "time_per_iteration": 2.781686305999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188864, + "balance_loss_mlp": 1.16833639, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09165243347067362, + "language_loss": 0.88987041, + "learning_rate": 0.0009511964761151197, + "loss": 0.90175903, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.2052002, + "step": 871, + "time_per_iteration": 2.5691447257995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122616, + "balance_loss_mlp": 1.20546532, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.09901550717192838, + "language_loss": 0.90224719, + "learning_rate": 0.0009510621405653521, + "loss": 0.91450876, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.20690918, + "step": 872, + "time_per_iteration": 2.585707426071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191078, + "balance_loss_mlp": 1.17049098, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.11167023861469132, + "language_loss": 0.83886391, + "learning_rate": 0.0009509276298977309, + "loss": 0.85077471, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.20581055, + "step": 873, + "time_per_iteration": 2.970672607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177127, + "balance_loss_mlp": 1.15688562, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.09073459995989616, + "language_loss": 0.81845176, + "learning_rate": 0.0009507929441644778, + "loss": 0.83022296, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.20239258, + "step": 874, + "time_per_iteration": 
3.5511813163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137436, + "balance_loss_mlp": 1.11694419, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.09068306382456774, + "language_loss": 0.85649496, + "learning_rate": 0.0009506580834178826, + "loss": 0.86786938, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.20495605, + "step": 875, + "time_per_iteration": 2.797485589981079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130428, + "balance_loss_mlp": 1.10986471, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.09154973704283995, + "language_loss": 0.91347295, + "learning_rate": 0.0009505230477103028, + "loss": 0.92477721, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.20568848, + "step": 876, + "time_per_iteration": 2.70495867729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145788, + "balance_loss_mlp": 1.12518883, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.10157591470828177, + "language_loss": 0.8152402, + "learning_rate": 0.0009503878370941641, + "loss": 0.82669806, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.20593262, + "step": 877, + "time_per_iteration": 2.735748052597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151178, + "balance_loss_mlp": 1.13084054, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.15099055549540594, + "language_loss": 0.88741207, + "learning_rate": 0.0009502524516219595, + "loss": 0.89892387, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.20336914, + "step": 878, + "time_per_iteration": 2.730163812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160201, + "balance_loss_mlp": 1.13942301, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.11693714010182361, + "language_loss": 0.9004457, + "learning_rate": 0.0009501168913462506, + 
"loss": 0.91204774, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.20788574, + "step": 879, + "time_per_iteration": 2.684805393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088136, + "balance_loss_mlp": 1.07440281, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.04309817230007909, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80210066, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.13769531, + "step": 880, + "time_per_iteration": 4.804383039474487 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166032, + "balance_loss_mlp": 1.14521825, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.08467938058221719, + "language_loss": 0.85053843, + "learning_rate": 0.0009498452465949042, + "loss": 0.86219883, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.20825195, + "step": 881, + "time_per_iteration": 3.276735305786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201291, + "balance_loss_mlp": 1.17981005, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.06992657838118156, + "language_loss": 0.91281927, + "learning_rate": 0.0009497091622247285, + "loss": 0.92483222, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.21484375, + "step": 882, + "time_per_iteration": 2.70647931098938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200556, + "balance_loss_mlp": 1.17901504, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.0696336676613267, + "language_loss": 0.93377209, + "learning_rate": 0.0009495729032619723, + "loss": 0.94577771, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.2154541, + "step": 883, + "time_per_iteration": 2.7534360885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227373, + "balance_loss_mlp": 1.20546222, + "epoch": 0.17006540977298962, + 
"flos": 755178909696.0, + "grad_norm": 0.08705372199297186, + "language_loss": 0.83726418, + "learning_rate": 0.0009494364697595354, + "loss": 0.84953797, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.21923828, + "step": 884, + "time_per_iteration": 2.9550111293792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242229, + "balance_loss_mlp": 1.22078347, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.08532836159387652, + "language_loss": 0.89805126, + "learning_rate": 0.0009492998617703867, + "loss": 0.91047359, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.21472168, + "step": 885, + "time_per_iteration": 2.710296154022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216994, + "balance_loss_mlp": 1.19604921, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.08218136336540412, + "language_loss": 0.87561512, + "learning_rate": 0.0009491630793475619, + "loss": 0.88778508, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.20959473, + "step": 886, + "time_per_iteration": 2.6574454307556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223448, + "balance_loss_mlp": 1.20190716, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.06673733954054763, + "language_loss": 0.85054195, + "learning_rate": 0.0009490261225441643, + "loss": 0.8627764, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.2154541, + "step": 887, + "time_per_iteration": 2.9003562927246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209094, + "balance_loss_mlp": 1.18812537, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07511336927499353, + "language_loss": 0.89910543, + "learning_rate": 0.0009488889914133656, + "loss": 0.91119635, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.2097168, + "step": 888, + "time_per_iteration": 2.9909205436706543 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121642, + "balance_loss_mlp": 1.19492674, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.07825003291748035, + "language_loss": 0.88796103, + "learning_rate": 0.0009487516860084047, + "loss": 0.90012527, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.21496582, + "step": 889, + "time_per_iteration": 2.7500009536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192765, + "balance_loss_mlp": 1.17159319, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.10600638107264272, + "language_loss": 0.88708925, + "learning_rate": 0.0009486142063825884, + "loss": 0.89901692, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.21179199, + "step": 890, + "time_per_iteration": 2.583644390106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212506, + "balance_loss_mlp": 1.19724751, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.09034147523214399, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73638725, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.15234375, + "step": 891, + "time_per_iteration": 4.9979774951934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175422, + "balance_loss_mlp": 1.1550256, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.1258166683219009, + "language_loss": 0.89561093, + "learning_rate": 0.0009483387246819542, + "loss": 0.9073652, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.20397949, + "step": 892, + "time_per_iteration": 2.7332327365875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068395, + "balance_loss_mlp": 1.05304134, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.03219618488122811, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83353972, 
+ "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.15332031, + "step": 893, + "time_per_iteration": 4.691076993942261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142338, + "balance_loss_mlp": 1.12172627, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.0974140714584663, + "language_loss": 0.88822401, + "learning_rate": 0.0009480625467392688, + "loss": 0.89964741, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.20617676, + "step": 894, + "time_per_iteration": 2.646313190460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_mlp": 1.02080703, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.032237767215918686, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79031026, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.15527344, + "step": 895, + "time_per_iteration": 4.73791241645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134198, + "balance_loss_mlp": 1.11333644, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.07818472841262332, + "language_loss": 0.8733896, + "learning_rate": 0.0009477856729834196, + "loss": 0.88473153, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.20874023, + "step": 896, + "time_per_iteration": 2.7401630878448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132796, + "balance_loss_mlp": 1.11235166, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.07866740874279901, + "language_loss": 0.89730608, + "learning_rate": 0.0009476469753098809, + "loss": 0.90863407, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.20446777, + "step": 897, + "time_per_iteration": 2.7601003646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141178, + "balance_loss_mlp": 1.12072182, + "epoch": 0.17275875336667948, + "flos": 
509687935488.0, + "grad_norm": 0.08200394390051394, + "language_loss": 0.86714321, + "learning_rate": 0.0009475081038443738, + "loss": 0.878555, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.20458984, + "step": 898, + "time_per_iteration": 2.621018171310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137375, + "balance_loss_mlp": 1.11602414, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.07995623076613839, + "language_loss": 0.85080326, + "learning_rate": 0.0009473690586408124, + "loss": 0.86217701, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.21374512, + "step": 899, + "time_per_iteration": 2.8553502559661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149397, + "balance_loss_mlp": 1.12811828, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.08690536389731517, + "language_loss": 0.85954648, + "learning_rate": 0.0009472298397531792, + "loss": 0.87104046, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.21276855, + "step": 900, + "time_per_iteration": 2.7427260875701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141965, + "balance_loss_mlp": 1.12017393, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.12119389218583115, + "language_loss": 0.86411273, + "learning_rate": 0.0009470904472355235, + "loss": 0.87553239, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.21801758, + "step": 901, + "time_per_iteration": 2.6585657596588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138192, + "balance_loss_mlp": 1.11563718, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.08947887393013387, + "language_loss": 0.79425454, + "learning_rate": 0.0009469508811419626, + "loss": 0.80563653, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.22570801, + "step": 902, + "time_per_iteration": 2.725372791290283 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207563, + "balance_loss_mlp": 1.1882031, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.06736803575768126, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72821391, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.19335938, + "step": 903, + "time_per_iteration": 4.819333553314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138702, + "balance_loss_mlp": 1.1156832, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.10583475939723401, + "language_loss": 0.83563209, + "learning_rate": 0.0009466712284439292, + "loss": 0.84701914, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.23022461, + "step": 904, + "time_per_iteration": 2.7723944187164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136442, + "balance_loss_mlp": 1.11426902, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.09911822478323383, + "language_loss": 0.88385195, + "learning_rate": 0.0009465311419480276, + "loss": 0.89521635, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.22180176, + "step": 905, + "time_per_iteration": 2.708866596221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161825, + "balance_loss_mlp": 1.14012873, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.07480170707629828, + "language_loss": 0.88125765, + "learning_rate": 0.0009463908820933622, + "loss": 0.89287591, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.21704102, + "step": 906, + "time_per_iteration": 2.8386967182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165754, + "balance_loss_mlp": 1.1450001, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.09057770875445449, + "language_loss": 0.82559198, + "learning_rate": 0.0009462504489343868, + "loss": 0.83724952, + 
"num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.20751953, + "step": 907, + "time_per_iteration": 2.8287012577056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182859, + "balance_loss_mlp": 1.16167533, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.0967031701007891, + "language_loss": 0.88244259, + "learning_rate": 0.0009461098425256222, + "loss": 0.89427125, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.21203613, + "step": 908, + "time_per_iteration": 2.636411190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184624, + "balance_loss_mlp": 1.16438186, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.08569423221876828, + "language_loss": 0.85917675, + "learning_rate": 0.0009459690629216567, + "loss": 0.87102294, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.20239258, + "step": 909, + "time_per_iteration": 2.6774063110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185319, + "balance_loss_mlp": 1.16585207, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.06867787211129477, + "language_loss": 0.87373209, + "learning_rate": 0.0009458281101771457, + "loss": 0.88558531, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.19445801, + "step": 910, + "time_per_iteration": 2.6256136894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183744, + "balance_loss_mlp": 1.16421723, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.07423161751862324, + "language_loss": 0.82895565, + "learning_rate": 0.0009456869843468122, + "loss": 0.84079307, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.19519043, + "step": 911, + "time_per_iteration": 2.8429157733917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181199, + "balance_loss_mlp": 1.16098118, + "epoch": 0.17545209696036937, + "flos": 
520972176384.0, + "grad_norm": 0.10560425483963332, + "language_loss": 0.78068089, + "learning_rate": 0.0009455456854854459, + "loss": 0.79249287, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.20214844, + "step": 912, + "time_per_iteration": 2.6220157146453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161811, + "balance_loss_mlp": 1.1425947, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.07427056945907796, + "language_loss": 0.84015787, + "learning_rate": 0.0009454042136479039, + "loss": 0.851776, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.19189453, + "step": 913, + "time_per_iteration": 2.5928330421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170659, + "balance_loss_mlp": 1.15183616, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.08169247609196438, + "language_loss": 0.82556438, + "learning_rate": 0.0009452625688891103, + "loss": 0.83727098, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.18798828, + "step": 914, + "time_per_iteration": 2.5541818141937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215011, + "balance_loss_mlp": 1.20032406, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.06355474766062214, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79949749, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.14648438, + "step": 915, + "time_per_iteration": 4.609099864959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151019, + "balance_loss_mlp": 1.13170671, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.08978748093655645, + "language_loss": 0.92478371, + "learning_rate": 0.0009449787608278015, + "loss": 0.9362939, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.19299316, + "step": 916, + "time_per_iteration": 2.8081016540527344 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144026, + "balance_loss_mlp": 1.12480903, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08892608358050076, + "language_loss": 0.9215048, + "learning_rate": 0.0009448365976354704, + "loss": 0.93294501, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.19213867, + "step": 917, + "time_per_iteration": 2.5476417541503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141081, + "balance_loss_mlp": 1.12047005, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.10930526403118525, + "language_loss": 0.89404565, + "learning_rate": 0.0009446942617422558, + "loss": 0.90545642, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.20617676, + "step": 918, + "time_per_iteration": 2.6054670810699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159521, + "balance_loss_mlp": 1.13917232, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.08039502929266268, + "language_loss": 0.84809625, + "learning_rate": 0.0009445517532034176, + "loss": 0.85969138, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.20349121, + "step": 919, + "time_per_iteration": 2.736720561981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116126, + "balance_loss_mlp": 1.14050603, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.09960932315337, + "language_loss": 0.88503635, + "learning_rate": 0.0009444090720742824, + "loss": 0.89664894, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.20751953, + "step": 920, + "time_per_iteration": 2.5981345176696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118497, + "balance_loss_mlp": 1.16263032, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.09080242050592086, + "language_loss": 0.87781966, + "learning_rate": 0.0009442662184102439, + "loss": 0.88966942, + 
"num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.22351074, + "step": 921, + "time_per_iteration": 2.855386972427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195366, + "balance_loss_mlp": 1.17316878, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.07657240030806824, + "language_loss": 0.86990869, + "learning_rate": 0.000944123192266763, + "loss": 0.88186234, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.22216797, + "step": 922, + "time_per_iteration": 2.862642526626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184819, + "balance_loss_mlp": 1.16284895, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.09417779830391854, + "language_loss": 0.83500814, + "learning_rate": 0.0009439799936993671, + "loss": 0.8468563, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.21960449, + "step": 923, + "time_per_iteration": 2.7609872817993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172194, + "balance_loss_mlp": 1.1505692, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.183012338078167, + "language_loss": 0.87992036, + "learning_rate": 0.0009438366227636511, + "loss": 0.89164221, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.21630859, + "step": 924, + "time_per_iteration": 2.680379867553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147406, + "balance_loss_mlp": 1.12692571, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.07052119854018758, + "language_loss": 0.8590064, + "learning_rate": 0.0009436930795152763, + "loss": 0.87048048, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.20483398, + "step": 925, + "time_per_iteration": 2.84305477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134796, + "balance_loss_mlp": 1.11461377, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + 
"grad_norm": 0.10542322310235813, + "language_loss": 0.86425805, + "learning_rate": 0.0009435493640099713, + "loss": 0.875606, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.20178223, + "step": 926, + "time_per_iteration": 2.8326363563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147089, + "balance_loss_mlp": 1.12663293, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.1030160256649362, + "language_loss": 0.83799899, + "learning_rate": 0.0009434054763035314, + "loss": 0.8494699, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.20458984, + "step": 927, + "time_per_iteration": 2.6224582195281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142086, + "balance_loss_mlp": 1.12232113, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.0964966031181637, + "language_loss": 0.85150439, + "learning_rate": 0.0009432614164518185, + "loss": 0.86292523, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.19750977, + "step": 928, + "time_per_iteration": 2.989607810974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115354, + "balance_loss_mlp": 1.13345337, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.11261525147662245, + "language_loss": 0.84222531, + "learning_rate": 0.000943117184510762, + "loss": 0.85376072, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.20080566, + "step": 929, + "time_per_iteration": 3.0107991695404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167369, + "balance_loss_mlp": 1.15220594, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.0706795740425107, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.7995733, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.15136719, + "step": 930, + "time_per_iteration": 5.0069990158081055 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01168853, + "balance_loss_mlp": 1.14890909, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.0722944763131068, + "language_loss": 0.885297, + "learning_rate": 0.0009428282045846674, + "loss": 0.89698553, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.19934082, + "step": 931, + "time_per_iteration": 2.705216884613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173254, + "balance_loss_mlp": 1.15314293, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.06808586729698768, + "language_loss": 0.89063865, + "learning_rate": 0.0009426834567118214, + "loss": 0.90237117, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.2010498, + "step": 932, + "time_per_iteration": 3.1137044429779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179521, + "balance_loss_mlp": 1.16003084, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.07690698304216284, + "language_loss": 0.80337363, + "learning_rate": 0.0009425385369740155, + "loss": 0.81516886, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.19470215, + "step": 933, + "time_per_iteration": 3.0430078506469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186692, + "balance_loss_mlp": 1.16659284, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.10248881334159239, + "language_loss": 0.86684513, + "learning_rate": 0.0009423934454275125, + "loss": 0.87871206, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.20092773, + "step": 934, + "time_per_iteration": 2.888127565383911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171583, + "balance_loss_mlp": 1.15185428, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.08181978587800019, + "language_loss": 0.91464841, + "learning_rate": 0.0009422481821286418, + "loss": 0.92636418, + "num_input_tokens_seen": 
77468288, + "router_z_loss_mlp": 0.19714355, + "step": 935, + "time_per_iteration": 2.725064516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.13605165, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.08977099192722084, + "language_loss": 0.87336344, + "learning_rate": 0.0009421027471337998, + "loss": 0.88491625, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.19213867, + "step": 936, + "time_per_iteration": 2.64992356300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153899, + "balance_loss_mlp": 1.13451552, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.08166389785278784, + "language_loss": 0.82045889, + "learning_rate": 0.0009419571404994493, + "loss": 0.83199793, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.19360352, + "step": 937, + "time_per_iteration": 2.6302027702331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140367, + "balance_loss_mlp": 1.12045932, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.10573813889003272, + "language_loss": 0.9057107, + "learning_rate": 0.00094181136228212, + "loss": 0.91711438, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.19909668, + "step": 938, + "time_per_iteration": 2.6472811698913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146966, + "balance_loss_mlp": 1.12671292, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.10223057205117164, + "language_loss": 0.85864574, + "learning_rate": 0.0009416654125384077, + "loss": 0.8701154, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.20251465, + "step": 939, + "time_per_iteration": 2.7523345947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100615, + "balance_loss_mlp": 1.08507037, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 
0.03692949506691956, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80872989, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.15527344, + "step": 940, + "time_per_iteration": 4.95509147644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139639, + "balance_loss_mlp": 1.1185863, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.07658245982623446, + "language_loss": 0.83579218, + "learning_rate": 0.000941372998698552, + "loss": 0.84718859, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.21057129, + "step": 941, + "time_per_iteration": 3.022993326187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152659, + "balance_loss_mlp": 1.13134432, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.08701506300356623, + "language_loss": 0.81278259, + "learning_rate": 0.0009412265347159336, + "loss": 0.82430923, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.21325684, + "step": 942, + "time_per_iteration": 2.7516462802886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.11446774, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.09990043941217396, + "language_loss": 0.84286022, + "learning_rate": 0.0009410798994339829, + "loss": 0.85422134, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.21655273, + "step": 943, + "time_per_iteration": 2.619678258895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125702, + "balance_loss_mlp": 1.10438752, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.0907244307429491, + "language_loss": 0.87645197, + "learning_rate": 0.000940933092909628, + "loss": 0.88770896, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.21337891, + "step": 944, + "time_per_iteration": 2.5915796756744385 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01137514, + "balance_loss_mlp": 1.11566281, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.07468252045243974, + "language_loss": 0.8361553, + "learning_rate": 0.0009407861151998649, + "loss": 0.84753042, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.21838379, + "step": 945, + "time_per_iteration": 2.597646713256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146698, + "balance_loss_mlp": 1.12490702, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.07893028842648955, + "language_loss": 0.85781825, + "learning_rate": 0.0009406389663617552, + "loss": 0.86928523, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.21789551, + "step": 946, + "time_per_iteration": 2.6909499168395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157426, + "balance_loss_mlp": 1.1367197, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.0883302731715351, + "language_loss": 0.85250366, + "learning_rate": 0.000940491646452427, + "loss": 0.86407793, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.20703125, + "step": 947, + "time_per_iteration": 2.7548892498016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188123, + "balance_loss_mlp": 1.16742826, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.09521178511918296, + "language_loss": 0.9039495, + "learning_rate": 0.000940344155529075, + "loss": 0.91583067, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.20690918, + "step": 948, + "time_per_iteration": 2.6882100105285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214568, + "balance_loss_mlp": 1.19396889, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.12174210826928723, + "language_loss": 0.86923814, + "learning_rate": 0.0009401964936489605, + "loss": 0.88138384, + "num_input_tokens_seen": 78602144, + 
"router_z_loss_mlp": 0.20605469, + "step": 949, + "time_per_iteration": 2.5339841842651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199663, + "balance_loss_mlp": 1.18013692, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.0789508013524053, + "language_loss": 0.85218668, + "learning_rate": 0.0009400486608694108, + "loss": 0.86418331, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.19506836, + "step": 950, + "time_per_iteration": 2.7437641620635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173826, + "balance_loss_mlp": 1.15394247, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.08777486633127113, + "language_loss": 0.87155032, + "learning_rate": 0.0009399006572478195, + "loss": 0.88328856, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.19873047, + "step": 951, + "time_per_iteration": 3.1146392822265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151812, + "balance_loss_mlp": 1.1324048, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.06965363368279433, + "language_loss": 0.90749818, + "learning_rate": 0.0009397524828416468, + "loss": 0.91901636, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.19384766, + "step": 952, + "time_per_iteration": 2.7005960941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150163, + "balance_loss_mlp": 1.13092208, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.08371144384200242, + "language_loss": 0.95721734, + "learning_rate": 0.0009396041377084192, + "loss": 0.96871901, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.19226074, + "step": 953, + "time_per_iteration": 2.65962290763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143055, + "balance_loss_mlp": 1.12399304, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.07808709569264205, 
+ "language_loss": 0.87208664, + "learning_rate": 0.0009394556219057295, + "loss": 0.88351727, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.19055176, + "step": 954, + "time_per_iteration": 2.7021074295043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146054, + "balance_loss_mlp": 1.12665915, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.0732836103686164, + "language_loss": 0.83296251, + "learning_rate": 0.0009393069354912362, + "loss": 0.84442306, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.19372559, + "step": 955, + "time_per_iteration": 2.7472946643829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146804, + "balance_loss_mlp": 1.12801623, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.07466806963668332, + "language_loss": 0.81601501, + "learning_rate": 0.0009391580785226649, + "loss": 0.827483, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.18798828, + "step": 956, + "time_per_iteration": 2.865922212600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.07007885, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.04640489893855834, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80424643, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.14160156, + "step": 957, + "time_per_iteration": 4.8100152015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115619, + "balance_loss_mlp": 1.13656831, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.08641924144795167, + "language_loss": 0.86033231, + "learning_rate": 0.0009388598531545196, + "loss": 0.87189424, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.19604492, + "step": 958, + "time_per_iteration": 2.879993438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01162855, + "balance_loss_mlp": 1.14316201, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.08295253694800603, + "language_loss": 0.85064113, + "learning_rate": 0.000938710484870727, + "loss": 0.8622697, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.19677734, + "step": 959, + "time_per_iteration": 2.6058270931243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169469, + "balance_loss_mlp": 1.14974046, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.0909196929102129, + "language_loss": 0.85416096, + "learning_rate": 0.0009385609462644189, + "loss": 0.86585563, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.19714355, + "step": 960, + "time_per_iteration": 4.22582483291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116162, + "balance_loss_mlp": 1.14138985, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.0839924836083711, + "language_loss": 0.8550421, + "learning_rate": 0.0009384112373936514, + "loss": 0.86665827, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.20227051, + "step": 961, + "time_per_iteration": 2.6566050052642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161461, + "balance_loss_mlp": 1.14142191, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.0943693164808434, + "language_loss": 0.90989888, + "learning_rate": 0.0009382613583165467, + "loss": 0.92151344, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.20031738, + "step": 962, + "time_per_iteration": 2.823707103729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115093, + "balance_loss_mlp": 1.13110566, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.07960710886198098, + "language_loss": 0.89083374, + "learning_rate": 0.0009381113090912928, + "loss": 0.90234309, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 
0.19824219, + "step": 963, + "time_per_iteration": 2.760617733001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113196, + "balance_loss_mlp": 1.11194444, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.09269195293936518, + "language_loss": 0.89102614, + "learning_rate": 0.000937961089776144, + "loss": 0.90234572, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.20007324, + "step": 964, + "time_per_iteration": 2.637064218521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137413, + "balance_loss_mlp": 1.11674166, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.09284731320409442, + "language_loss": 0.82889503, + "learning_rate": 0.0009378107004294208, + "loss": 0.84026921, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.20678711, + "step": 965, + "time_per_iteration": 2.9863977432250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133143, + "balance_loss_mlp": 1.11312819, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.08496740626071231, + "language_loss": 0.90790451, + "learning_rate": 0.0009376601411095096, + "loss": 0.91923594, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.20007324, + "step": 966, + "time_per_iteration": 2.68448543548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118077, + "balance_loss_mlp": 1.09840786, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.07860547413279617, + "language_loss": 0.8636961, + "learning_rate": 0.0009375094118748622, + "loss": 0.87487686, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.1965332, + "step": 967, + "time_per_iteration": 2.6023223400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116788, + "balance_loss_mlp": 1.09746408, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.09121906518800267, + "language_loss": 
0.90388292, + "learning_rate": 0.0009373585127839976, + "loss": 0.91505075, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.19299316, + "step": 968, + "time_per_iteration": 2.9992241859436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128327, + "balance_loss_mlp": 1.10974205, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08500834593637788, + "language_loss": 0.90474886, + "learning_rate": 0.0009372074438954994, + "loss": 0.91603214, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.18579102, + "step": 969, + "time_per_iteration": 2.6900458335876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129744, + "balance_loss_mlp": 1.11119485, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.07463307704621708, + "language_loss": 0.91465181, + "learning_rate": 0.0009370562052680181, + "loss": 0.92594928, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.18554688, + "step": 970, + "time_per_iteration": 2.4830586910247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118559, + "balance_loss_mlp": 1.10014117, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.0879562727670826, + "language_loss": 0.89281493, + "learning_rate": 0.0009369047969602695, + "loss": 0.90400052, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.18432617, + "step": 971, + "time_per_iteration": 2.745058298110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126638, + "balance_loss_mlp": 1.10707593, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.10844584745321367, + "language_loss": 0.862324, + "learning_rate": 0.0009367532190310357, + "loss": 0.87359041, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.19543457, + "step": 972, + "time_per_iteration": 2.6137964725494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113488, + 
"balance_loss_mlp": 1.09404469, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.07658656218276177, + "language_loss": 0.88875228, + "learning_rate": 0.0009366014715391644, + "loss": 0.8998872, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.19433594, + "step": 973, + "time_per_iteration": 2.6654906272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112059, + "balance_loss_mlp": 1.09299731, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.11180851981284076, + "language_loss": 0.83713347, + "learning_rate": 0.0009364495545435693, + "loss": 0.84825402, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.19055176, + "step": 974, + "time_per_iteration": 2.801388740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120051, + "balance_loss_mlp": 1.1004051, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.06978545014802194, + "language_loss": 0.87871438, + "learning_rate": 0.0009362974681032297, + "loss": 0.88991487, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.19628906, + "step": 975, + "time_per_iteration": 2.6227941513061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124993, + "balance_loss_mlp": 1.10491848, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.08030171004504767, + "language_loss": 0.88050348, + "learning_rate": 0.0009361452122771907, + "loss": 0.89175344, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.20080566, + "step": 976, + "time_per_iteration": 2.899641752243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139865, + "balance_loss_mlp": 1.1185981, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.09158450212133555, + "language_loss": 0.82837689, + "learning_rate": 0.0009359927871245635, + "loss": 0.8397755, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.21289062, + 
"step": 977, + "time_per_iteration": 2.5095362663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147728, + "balance_loss_mlp": 1.12616336, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.08436158367459867, + "language_loss": 0.86086357, + "learning_rate": 0.0009358401927045246, + "loss": 0.8723408, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.21569824, + "step": 978, + "time_per_iteration": 2.880329132080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115214, + "balance_loss_mlp": 1.12937117, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.14896441210102726, + "language_loss": 0.881185, + "learning_rate": 0.0009356874290763166, + "loss": 0.89270639, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.22753906, + "step": 979, + "time_per_iteration": 3.519901990890503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146434, + "balance_loss_mlp": 1.12485671, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.08194638070334626, + "language_loss": 0.88670301, + "learning_rate": 0.0009355344962992474, + "loss": 0.89816737, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.21606445, + "step": 980, + "time_per_iteration": 2.638364553451538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137899, + "balance_loss_mlp": 1.11571455, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.07836652437453029, + "language_loss": 0.8762567, + "learning_rate": 0.0009353813944326908, + "loss": 0.88763571, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.22180176, + "step": 981, + "time_per_iteration": 2.963667869567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131587, + "balance_loss_mlp": 1.10924709, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.08486883897693408, + "language_loss": 0.82728517, + 
"learning_rate": 0.0009352281235360863, + "loss": 0.83860105, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.22338867, + "step": 982, + "time_per_iteration": 2.752194404602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146713, + "balance_loss_mlp": 1.12631679, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.08390803894001939, + "language_loss": 0.84704804, + "learning_rate": 0.0009350746836689389, + "loss": 0.85851514, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.20385742, + "step": 983, + "time_per_iteration": 2.572817325592041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114981, + "balance_loss_mlp": 1.13550532, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.06256828552174507, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.8258903, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.14257812, + "step": 984, + "time_per_iteration": 5.0779805183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126626, + "balance_loss_mlp": 1.10678935, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08472556529064418, + "language_loss": 0.82448637, + "learning_rate": 0.0009347672972613634, + "loss": 0.83575261, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.19824219, + "step": 985, + "time_per_iteration": 2.615293502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113053, + "balance_loss_mlp": 1.11202836, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.06995806836739982, + "language_loss": 0.8510493, + "learning_rate": 0.0009346133508402735, + "loss": 0.86235464, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.18469238, + "step": 986, + "time_per_iteration": 2.729766845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145502, + "balance_loss_mlp": 
1.12719178, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.07783152768123536, + "language_loss": 0.83385336, + "learning_rate": 0.0009344592356873166, + "loss": 0.84530836, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.18322754, + "step": 987, + "time_per_iteration": 2.642298698425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142552, + "balance_loss_mlp": 1.12420571, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.1311760581731783, + "language_loss": 0.78159761, + "learning_rate": 0.0009343049518623255, + "loss": 0.79302317, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.18359375, + "step": 988, + "time_per_iteration": 2.7496607303619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147502, + "balance_loss_mlp": 1.12969208, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.07011475213003748, + "language_loss": 0.82941067, + "learning_rate": 0.0009341504994251985, + "loss": 0.8408857, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.17822266, + "step": 989, + "time_per_iteration": 2.850295305252075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154172, + "balance_loss_mlp": 1.13986683, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.061552691423840886, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74674672, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.14257812, + "step": 990, + "time_per_iteration": 5.020269393920898 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160468, + "balance_loss_mlp": 1.14208579, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.07610354532645859, + "language_loss": 0.81556082, + "learning_rate": 0.0009338410889544574, + "loss": 0.82716548, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.18383789, + "step": 991, + 
"time_per_iteration": 3.0640664100646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159788, + "balance_loss_mlp": 1.14151347, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.07533691574431517, + "language_loss": 0.87469906, + "learning_rate": 0.000933686131040967, + "loss": 0.88629693, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.18273926, + "step": 992, + "time_per_iteration": 2.8369646072387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153048, + "balance_loss_mlp": 1.13516688, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.2292689794441624, + "language_loss": 0.90069616, + "learning_rate": 0.0009335310047555883, + "loss": 0.91222656, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.17895508, + "step": 993, + "time_per_iteration": 2.7662436962127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201823, + "balance_loss_mlp": 1.18303561, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.08969446374481721, + "language_loss": 0.87941462, + "learning_rate": 0.0009333757101585467, + "loss": 0.89143288, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.18786621, + "step": 994, + "time_per_iteration": 2.6766159534454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248094, + "balance_loss_mlp": 1.22967577, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.09684982281817384, + "language_loss": 0.93064606, + "learning_rate": 0.0009332202473101329, + "loss": 0.94312704, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.1842041, + "step": 995, + "time_per_iteration": 2.6848959922790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124761, + "balance_loss_mlp": 1.22866774, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.14945399887744149, + "language_loss": 0.82354605, + "learning_rate": 
0.0009330646162707028, + "loss": 0.83602214, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.18933105, + "step": 996, + "time_per_iteration": 2.7672605514526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120105, + "balance_loss_mlp": 1.18239403, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.09345568382872575, + "language_loss": 0.83716351, + "learning_rate": 0.0009329088171006779, + "loss": 0.84917402, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.18664551, + "step": 997, + "time_per_iteration": 3.177269697189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171291, + "balance_loss_mlp": 1.15201521, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.09261663839867938, + "language_loss": 0.85307527, + "learning_rate": 0.0009327528498605446, + "loss": 0.86478817, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.19274902, + "step": 998, + "time_per_iteration": 2.5818471908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136553, + "balance_loss_mlp": 1.11700296, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.11232924304021881, + "language_loss": 0.89184988, + "learning_rate": 0.0009325967146108548, + "loss": 0.90321541, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.1953125, + "step": 999, + "time_per_iteration": 2.672342300415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141697, + "balance_loss_mlp": 1.12257588, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.11996696196806446, + "language_loss": 0.87541509, + "learning_rate": 0.0009324404114122258, + "loss": 0.88683212, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.19104004, + "step": 1000, + "time_per_iteration": 2.7652101516723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142189, + "balance_loss_mlp": 1.12290096, + 
"epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.09563187877453348, + "language_loss": 0.86816871, + "learning_rate": 0.0009322839403253397, + "loss": 0.87959063, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.19274902, + "step": 1001, + "time_per_iteration": 2.7855865955352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113304, + "balance_loss_mlp": 1.11353719, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.0964526780140198, + "language_loss": 0.8374511, + "learning_rate": 0.0009321273014109439, + "loss": 0.84878153, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.19494629, + "step": 1002, + "time_per_iteration": 2.9773457050323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137462, + "balance_loss_mlp": 1.11835289, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.07256968924395192, + "language_loss": 0.8405087, + "learning_rate": 0.0009319704947298513, + "loss": 0.85188329, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.19104004, + "step": 1003, + "time_per_iteration": 2.8997581005096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144905, + "balance_loss_mlp": 1.12630868, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.15770574603346119, + "language_loss": 0.88051564, + "learning_rate": 0.0009318135203429393, + "loss": 0.89196467, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.18579102, + "step": 1004, + "time_per_iteration": 4.269490957260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156184, + "balance_loss_mlp": 1.13703942, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.08756431218137971, + "language_loss": 0.87512451, + "learning_rate": 0.0009316563783111511, + "loss": 0.88668633, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.19128418, + "step": 1005, + 
"time_per_iteration": 2.741323471069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164816, + "balance_loss_mlp": 1.14583826, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.06803118553980413, + "language_loss": 0.81866097, + "learning_rate": 0.0009314990686954943, + "loss": 0.83030909, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.18969727, + "step": 1006, + "time_per_iteration": 2.955195903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198123, + "balance_loss_mlp": 1.1794908, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.08085614110860996, + "language_loss": 0.80862725, + "learning_rate": 0.000931341591557042, + "loss": 0.8206085, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.18615723, + "step": 1007, + "time_per_iteration": 3.74294114112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192787, + "balance_loss_mlp": 1.17408264, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.10092207476563657, + "language_loss": 0.87274837, + "learning_rate": 0.0009311839469569325, + "loss": 0.88467628, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.18701172, + "step": 1008, + "time_per_iteration": 2.7143359184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188299, + "balance_loss_mlp": 1.16947544, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.10252165229611418, + "language_loss": 0.86257041, + "learning_rate": 0.0009310261349563687, + "loss": 0.87445343, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.18823242, + "step": 1009, + "time_per_iteration": 2.7420098781585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156356, + "balance_loss_mlp": 1.13825965, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.06920598095285249, + "language_loss": 0.8520751, + 
"learning_rate": 0.0009308681556166186, + "loss": 0.86363864, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.18103027, + "step": 1010, + "time_per_iteration": 2.8593883514404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162311, + "balance_loss_mlp": 1.14391661, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.10589580567356643, + "language_loss": 0.87318867, + "learning_rate": 0.0009307100089990152, + "loss": 0.88481176, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.18408203, + "step": 1011, + "time_per_iteration": 2.7444002628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144739, + "balance_loss_mlp": 1.12624931, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.10287575048528846, + "language_loss": 0.83773112, + "learning_rate": 0.0009305516951649568, + "loss": 0.84917855, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.18481445, + "step": 1012, + "time_per_iteration": 2.7355475425720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174282, + "balance_loss_mlp": 1.15630519, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07063143065951083, + "language_loss": 0.86586678, + "learning_rate": 0.0009303932141759057, + "loss": 0.87760961, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.17980957, + "step": 1013, + "time_per_iteration": 2.778740882873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166591, + "balance_loss_mlp": 1.14829278, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.09801085242945827, + "language_loss": 0.83495271, + "learning_rate": 0.0009302345660933902, + "loss": 0.84661865, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.18286133, + "step": 1014, + "time_per_iteration": 2.8084325790405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178039, + 
"balance_loss_mlp": 1.1603483, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.1010340318018862, + "language_loss": 0.84950441, + "learning_rate": 0.0009300757509790026, + "loss": 0.86128479, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.17712402, + "step": 1015, + "time_per_iteration": 2.9023685455322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179228, + "balance_loss_mlp": 1.16137052, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.1305336983537898, + "language_loss": 0.90272522, + "learning_rate": 0.0009299167688944005, + "loss": 0.91451752, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.17883301, + "step": 1016, + "time_per_iteration": 2.5396370887756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180846, + "balance_loss_mlp": 1.16236818, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.10642959866559894, + "language_loss": 0.85698497, + "learning_rate": 0.0009297576199013063, + "loss": 0.86879343, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.18457031, + "step": 1017, + "time_per_iteration": 2.7503206729888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151973, + "balance_loss_mlp": 1.13890779, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.05607404145793752, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74154103, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.13085938, + "step": 1018, + "time_per_iteration": 4.931609153747559 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106819, + "balance_loss_mlp": 1.09365869, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.04672191734885249, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80533117, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 
0.13183594, + "step": 1019, + "time_per_iteration": 5.336720705032349 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228797, + "balance_loss_mlp": 1.21011734, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.07997087287444872, + "language_loss": 0.86300683, + "learning_rate": 0.0009292791720892659, + "loss": 0.8752948, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.18664551, + "step": 1020, + "time_per_iteration": 2.8861892223358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221818, + "balance_loss_mlp": 1.20275593, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.08883950328468299, + "language_loss": 0.88082206, + "learning_rate": 0.0009291193560807218, + "loss": 0.89304024, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.19055176, + "step": 1021, + "time_per_iteration": 2.6382570266723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209623, + "balance_loss_mlp": 1.19078755, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.07890952504822618, + "language_loss": 0.86793423, + "learning_rate": 0.0009289593734732688, + "loss": 0.88003045, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.18811035, + "step": 1022, + "time_per_iteration": 2.6261141300201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185276, + "balance_loss_mlp": 1.16670358, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.0835325264325779, + "language_loss": 0.93570763, + "learning_rate": 0.0009287992243290175, + "loss": 0.94756043, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.18579102, + "step": 1023, + "time_per_iteration": 2.515672445297241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161758, + "balance_loss_mlp": 1.14213622, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.07747777445836627, + 
"language_loss": 0.9021076, + "learning_rate": 0.0009286389087101435, + "loss": 0.9137252, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.19604492, + "step": 1024, + "time_per_iteration": 2.8165409564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144138, + "balance_loss_mlp": 1.12458754, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.093529298896635, + "language_loss": 0.88402045, + "learning_rate": 0.0009284784266788864, + "loss": 0.8954618, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.19543457, + "step": 1025, + "time_per_iteration": 2.746727705001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143253, + "balance_loss_mlp": 1.12456095, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.07377516343298976, + "language_loss": 0.92142463, + "learning_rate": 0.0009283177782975512, + "loss": 0.9328571, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.18688965, + "step": 1026, + "time_per_iteration": 3.0783705711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125598, + "balance_loss_mlp": 1.1064887, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.09283572483169282, + "language_loss": 0.87607288, + "learning_rate": 0.000928156963628507, + "loss": 0.8873288, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.19116211, + "step": 1027, + "time_per_iteration": 2.6074790954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119339, + "balance_loss_mlp": 1.09947884, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.17318139898935403, + "language_loss": 0.87847698, + "learning_rate": 0.0009279959827341877, + "loss": 0.88967031, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.1986084, + "step": 1028, + "time_per_iteration": 2.786883592605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122475, 
+ "balance_loss_mlp": 1.10186362, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.09725837933244906, + "language_loss": 0.87463772, + "learning_rate": 0.0009278348356770915, + "loss": 0.88586247, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.20617676, + "step": 1029, + "time_per_iteration": 2.6152124404907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115903, + "balance_loss_mlp": 1.09576869, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.09726879406227856, + "language_loss": 0.85104239, + "learning_rate": 0.0009276735225197814, + "loss": 0.86220145, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.20129395, + "step": 1030, + "time_per_iteration": 2.6491973400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140863, + "balance_loss_mlp": 1.12079978, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.07981294302307375, + "language_loss": 0.85465813, + "learning_rate": 0.0009275120433248847, + "loss": 0.86606669, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.20056152, + "step": 1031, + "time_per_iteration": 2.7181904315948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170044, + "balance_loss_mlp": 1.14986157, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.08870117223998657, + "language_loss": 0.85574758, + "learning_rate": 0.0009273503981550931, + "loss": 0.86744803, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.20178223, + "step": 1032, + "time_per_iteration": 3.15751576423645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210574, + "balance_loss_mlp": 1.19066548, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.10622365116136065, + "language_loss": 0.86958814, + "learning_rate": 0.0009271885870731626, + "loss": 0.88169384, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 
0.19909668, + "step": 1033, + "time_per_iteration": 2.513871431350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124371, + "balance_loss_mlp": 1.22355127, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.12163862472720371, + "language_loss": 0.88120484, + "learning_rate": 0.0009270266101419143, + "loss": 0.89364195, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.20153809, + "step": 1034, + "time_per_iteration": 2.6154308319091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233971, + "balance_loss_mlp": 1.21453989, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.13626001105869123, + "language_loss": 0.84950191, + "learning_rate": 0.0009268644674242328, + "loss": 0.86184162, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.19433594, + "step": 1035, + "time_per_iteration": 2.706982135772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220957, + "balance_loss_mlp": 1.20152593, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.09310216058180905, + "language_loss": 0.80796313, + "learning_rate": 0.0009267021589830678, + "loss": 0.82017273, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.19421387, + "step": 1036, + "time_per_iteration": 2.641144275665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300787, + "balance_loss_mlp": 1.28457427, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.08257719551105532, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78927869, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.16210938, + "step": 1037, + "time_per_iteration": 5.017476558685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198691, + "balance_loss_mlp": 1.17903364, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.08600893320147879, + 
"language_loss": 0.92715919, + "learning_rate": 0.000926377045182406, + "loss": 0.93914616, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.19641113, + "step": 1038, + "time_per_iteration": 2.939668893814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215159, + "balance_loss_mlp": 1.19595408, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.24386100452943713, + "language_loss": 0.87511599, + "learning_rate": 0.0009262142399491296, + "loss": 0.88726759, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.19189453, + "step": 1039, + "time_per_iteration": 3.0862977504730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248948, + "balance_loss_mlp": 1.22932601, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.09408226392225982, + "language_loss": 0.87996912, + "learning_rate": 0.0009260512692448105, + "loss": 0.89245868, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.19604492, + "step": 1040, + "time_per_iteration": 2.711160182952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288003, + "balance_loss_mlp": 1.26749945, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.13301921079143278, + "language_loss": 0.84115559, + "learning_rate": 0.000925888133132719, + "loss": 0.85403562, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.20507812, + "step": 1041, + "time_per_iteration": 2.740140199661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166251, + "balance_loss_mlp": 1.1515646, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.059408002972858115, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8077668, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.14648438, + "step": 1042, + "time_per_iteration": 4.983680009841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01318672, + "balance_loss_mlp": 1.29690433, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.1163225797864763, + "language_loss": 0.81054026, + "learning_rate": 0.0009255613649386244, + "loss": 0.82372701, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.21777344, + "step": 1043, + "time_per_iteration": 2.6790683269500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300915, + "balance_loss_mlp": 1.27936232, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.10848871275509671, + "language_loss": 0.78969169, + "learning_rate": 0.0009253977329834838, + "loss": 0.80270082, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.21569824, + "step": 1044, + "time_per_iteration": 2.6970701217651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286746, + "balance_loss_mlp": 1.26458514, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.09565462118383694, + "language_loss": 0.86161876, + "learning_rate": 0.0009252339358742965, + "loss": 0.87448621, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.22167969, + "step": 1045, + "time_per_iteration": 2.87453556060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129502, + "balance_loss_mlp": 1.2733593, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.10796199739740596, + "language_loss": 0.83195245, + "learning_rate": 0.000925069973674654, + "loss": 0.84490263, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.21679688, + "step": 1046, + "time_per_iteration": 2.6612823009490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275399, + "balance_loss_mlp": 1.25408411, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.06722367899146847, + "language_loss": 0.88250053, + "learning_rate": 0.000924905846448212, + "loss": 0.89525455, + "num_input_tokens_seen": 86883440, + 
"router_z_loss_mlp": 0.21325684, + "step": 1047, + "time_per_iteration": 2.730875015258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01292917, + "balance_loss_mlp": 1.27123272, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.09038052031526789, + "language_loss": 0.85797572, + "learning_rate": 0.0009247415542586906, + "loss": 0.87090492, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.21691895, + "step": 1048, + "time_per_iteration": 2.8412506580352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248583, + "balance_loss_mlp": 1.22672033, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.08064336148566398, + "language_loss": 0.83021247, + "learning_rate": 0.0009245770971698735, + "loss": 0.84269828, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.21875, + "step": 1049, + "time_per_iteration": 4.440186023712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237632, + "balance_loss_mlp": 1.21671033, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.08794152426297831, + "language_loss": 0.88490599, + "learning_rate": 0.0009244124752456087, + "loss": 0.89728236, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.20922852, + "step": 1050, + "time_per_iteration": 2.529827833175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224991, + "balance_loss_mlp": 1.20434391, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.07833105787078826, + "language_loss": 0.85121548, + "learning_rate": 0.0009242476885498081, + "loss": 0.86346543, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.20654297, + "step": 1051, + "time_per_iteration": 2.7487235069274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201681, + "balance_loss_mlp": 1.18077159, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 
0.09537947845979083, + "language_loss": 0.80832058, + "learning_rate": 0.0009240827371464474, + "loss": 0.82033736, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.20922852, + "step": 1052, + "time_per_iteration": 2.570289373397827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190217, + "balance_loss_mlp": 1.16978419, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.0749559041873476, + "language_loss": 0.83869404, + "learning_rate": 0.0009239176210995666, + "loss": 0.85059625, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.2043457, + "step": 1053, + "time_per_iteration": 3.48331880569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164732, + "balance_loss_mlp": 1.14463329, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.08759256892165929, + "language_loss": 0.9366219, + "learning_rate": 0.0009237523404732695, + "loss": 0.94826925, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.2010498, + "step": 1054, + "time_per_iteration": 2.8900768756866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152229, + "balance_loss_mlp": 1.13102162, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.08554891996887364, + "language_loss": 0.84106672, + "learning_rate": 0.0009235868953317235, + "loss": 0.85258889, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.21191406, + "step": 1055, + "time_per_iteration": 2.805739402770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152429, + "balance_loss_mlp": 1.1321516, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.08283401132516657, + "language_loss": 0.84830916, + "learning_rate": 0.0009234212857391602, + "loss": 0.85983348, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.20275879, + "step": 1056, + "time_per_iteration": 3.2523794174194336 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01150661, + "balance_loss_mlp": 1.13000214, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.08956025084292601, + "language_loss": 0.88911903, + "learning_rate": 0.000923255511759875, + "loss": 0.90062559, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.20666504, + "step": 1057, + "time_per_iteration": 2.7904763221740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144001, + "balance_loss_mlp": 1.12379456, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.0943960049444156, + "language_loss": 0.84853089, + "learning_rate": 0.000923089573458227, + "loss": 0.85997093, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.2019043, + "step": 1058, + "time_per_iteration": 2.8817007541656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152473, + "balance_loss_mlp": 1.13152814, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.0957717786757319, + "language_loss": 0.83558518, + "learning_rate": 0.0009229234708986392, + "loss": 0.84710991, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.20947266, + "step": 1059, + "time_per_iteration": 2.9059059619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179467, + "balance_loss_mlp": 1.1632545, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.05660436116329576, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82846367, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.16210938, + "step": 1060, + "time_per_iteration": 4.709235429763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158087, + "balance_loss_mlp": 1.13642621, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.07273861691254356, + "language_loss": 0.84919071, + "learning_rate": 0.0009225907732636548, + "loss": 0.86077166, + "num_input_tokens_seen": 
88090864, + "router_z_loss_mlp": 0.2166748, + "step": 1061, + "time_per_iteration": 2.7832870483398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170458, + "balance_loss_mlp": 1.14922678, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.10826308082162117, + "language_loss": 0.86149454, + "learning_rate": 0.0009224241783174227, + "loss": 0.87319911, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.21252441, + "step": 1062, + "time_per_iteration": 2.7493624687194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116711, + "balance_loss_mlp": 1.14574718, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.0807963285895634, + "language_loss": 0.85689318, + "learning_rate": 0.0009222574193715802, + "loss": 0.86856437, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.21374512, + "step": 1063, + "time_per_iteration": 2.8018240928649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159842, + "balance_loss_mlp": 1.13889694, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.07340119955686962, + "language_loss": 0.85735941, + "learning_rate": 0.000922090496490869, + "loss": 0.86895782, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.20947266, + "step": 1064, + "time_per_iteration": 2.765749931335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152698, + "balance_loss_mlp": 1.13164544, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.07242145518951734, + "language_loss": 0.89867234, + "learning_rate": 0.0009219234097400937, + "loss": 0.9101994, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.21057129, + "step": 1065, + "time_per_iteration": 2.8627817630767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114786, + "balance_loss_mlp": 1.12674773, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 
0.08464925787423999, + "language_loss": 0.83060288, + "learning_rate": 0.0009217561591841237, + "loss": 0.84208149, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.21130371, + "step": 1066, + "time_per_iteration": 3.3423283100128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142136, + "balance_loss_mlp": 1.12129867, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.08558035413219019, + "language_loss": 0.80671912, + "learning_rate": 0.0009215887448878913, + "loss": 0.81814051, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.20849609, + "step": 1067, + "time_per_iteration": 2.5908420085906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133301, + "balance_loss_mlp": 1.11204648, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.08226430294551884, + "language_loss": 0.8469618, + "learning_rate": 0.0009214211669163922, + "loss": 0.85829484, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.21264648, + "step": 1068, + "time_per_iteration": 2.70798397064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136986, + "balance_loss_mlp": 1.11625564, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.08433693913464968, + "language_loss": 0.9379245, + "learning_rate": 0.0009212534253346862, + "loss": 0.94929433, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.20727539, + "step": 1069, + "time_per_iteration": 2.7776713371276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129003, + "balance_loss_mlp": 1.10772455, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.09450930819857521, + "language_loss": 0.8384515, + "learning_rate": 0.0009210855202078964, + "loss": 0.84974158, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.2130127, + "step": 1070, + "time_per_iteration": 2.6283328533172607 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01130904, + "balance_loss_mlp": 1.11017382, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.08132695111234396, + "language_loss": 0.86854172, + "learning_rate": 0.0009209174516012091, + "loss": 0.87985075, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.20751953, + "step": 1071, + "time_per_iteration": 2.535447120666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133709, + "balance_loss_mlp": 1.11270416, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.11111326067788187, + "language_loss": 0.88662505, + "learning_rate": 0.0009207492195798747, + "loss": 0.89796209, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.21008301, + "step": 1072, + "time_per_iteration": 2.7883682250976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144236, + "balance_loss_mlp": 1.12275457, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.10819626667436329, + "language_loss": 0.84654653, + "learning_rate": 0.0009205808242092061, + "loss": 0.85798889, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.21484375, + "step": 1073, + "time_per_iteration": 2.6761436462402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166479, + "balance_loss_mlp": 1.1445806, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.10070475961417262, + "language_loss": 0.82806575, + "learning_rate": 0.0009204122655545808, + "loss": 0.8397305, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.21911621, + "step": 1074, + "time_per_iteration": 3.326646089553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169219, + "balance_loss_mlp": 1.14714098, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.07526963641038939, + "language_loss": 0.80370897, + "learning_rate": 0.0009202435436814388, + "loss": 0.8154012, + "num_input_tokens_seen": 
89119024, + "router_z_loss_mlp": 0.22070312, + "step": 1075, + "time_per_iteration": 2.718374013900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117617, + "balance_loss_mlp": 1.15484309, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.08141199692544657, + "language_loss": 0.89125872, + "learning_rate": 0.0009200746586552836, + "loss": 0.90302044, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.21350098, + "step": 1076, + "time_per_iteration": 2.9237890243530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116406, + "balance_loss_mlp": 1.14320993, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.08915437819246362, + "language_loss": 0.83578765, + "learning_rate": 0.0009199056105416825, + "loss": 0.8474282, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.20861816, + "step": 1077, + "time_per_iteration": 3.1017873287200928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174288, + "balance_loss_mlp": 1.15383148, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.08235458210831342, + "language_loss": 0.8621031, + "learning_rate": 0.0009197363994062654, + "loss": 0.87384599, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.20458984, + "step": 1078, + "time_per_iteration": 2.832416296005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115862, + "balance_loss_mlp": 1.13828301, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.14524602294533026, + "language_loss": 0.8378703, + "learning_rate": 0.0009195670253147262, + "loss": 0.84945655, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.20336914, + "step": 1079, + "time_per_iteration": 2.9912445545196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130057, + "balance_loss_mlp": 1.11056602, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 
0.07398728313760368, + "language_loss": 0.81629539, + "learning_rate": 0.0009193974883328216, + "loss": 0.82759595, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.19470215, + "step": 1080, + "time_per_iteration": 2.636516809463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142545, + "balance_loss_mlp": 1.12286365, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.08145379169955597, + "language_loss": 0.86828917, + "learning_rate": 0.0009192277885263718, + "loss": 0.87971467, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.19665527, + "step": 1081, + "time_per_iteration": 2.7361197471618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137254, + "balance_loss_mlp": 1.11765575, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.09498097190043973, + "language_loss": 0.85732365, + "learning_rate": 0.0009190579259612602, + "loss": 0.86869615, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.19580078, + "step": 1082, + "time_per_iteration": 3.3791959285736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156614, + "balance_loss_mlp": 1.13621759, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.1488703614850634, + "language_loss": 0.86399055, + "learning_rate": 0.000918887900703433, + "loss": 0.87555665, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.20397949, + "step": 1083, + "time_per_iteration": 2.8133795261383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148544, + "balance_loss_mlp": 1.129125, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.0859641513447352, + "language_loss": 0.90200919, + "learning_rate": 0.0009187177128188999, + "loss": 0.91349459, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.19396973, + "step": 1084, + "time_per_iteration": 2.4999842643737793 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01286106, + "balance_loss_mlp": 1.27151525, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.08105811039849961, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78442645, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.14550781, + "step": 1085, + "time_per_iteration": 4.8958563804626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153669, + "balance_loss_mlp": 1.13441706, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.08197687066157772, + "language_loss": 0.85811758, + "learning_rate": 0.000918376849434071, + "loss": 0.86965424, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.19250488, + "step": 1086, + "time_per_iteration": 2.5344736576080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118214, + "balance_loss_mlp": 1.16158867, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.10825532619194118, + "language_loss": 0.90649915, + "learning_rate": 0.0009182061740661098, + "loss": 0.9183206, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.20556641, + "step": 1087, + "time_per_iteration": 2.5707151889801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178442, + "balance_loss_mlp": 1.15811718, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.08160475290131898, + "language_loss": 0.84683895, + "learning_rate": 0.0009180353363361127, + "loss": 0.85862345, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.203125, + "step": 1088, + "time_per_iteration": 3.137329339981079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174451, + "balance_loss_mlp": 1.15374422, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.10140667942926032, + "language_loss": 0.81920874, + "learning_rate": 0.0009178643363104044, + "loss": 0.83095324, + "num_input_tokens_seen": 
90337952, + "router_z_loss_mlp": 0.20715332, + "step": 1089, + "time_per_iteration": 3.1493358612060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147137, + "balance_loss_mlp": 1.12660897, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.10442412310556573, + "language_loss": 0.90355861, + "learning_rate": 0.0009176931740553735, + "loss": 0.91503, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.20532227, + "step": 1090, + "time_per_iteration": 2.5557990074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139677, + "balance_loss_mlp": 1.11933959, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.17656839042402708, + "language_loss": 0.82232946, + "learning_rate": 0.0009175218496374708, + "loss": 0.83372623, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.20349121, + "step": 1091, + "time_per_iteration": 3.3492214679718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132775, + "balance_loss_mlp": 1.11287904, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.09269359078641065, + "language_loss": 0.85681468, + "learning_rate": 0.0009173503631232103, + "loss": 0.86814249, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.19885254, + "step": 1092, + "time_per_iteration": 3.396247386932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131208, + "balance_loss_mlp": 1.11091864, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.09283462310009857, + "language_loss": 0.81684232, + "learning_rate": 0.0009171787145791691, + "loss": 0.82815444, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.20288086, + "step": 1093, + "time_per_iteration": 3.2441000938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132702, + "balance_loss_mlp": 1.11279404, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 
0.14183927725725606, + "language_loss": 0.79456544, + "learning_rate": 0.000917006904071987, + "loss": 0.80589247, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.19897461, + "step": 1094, + "time_per_iteration": 2.658992052078247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140578, + "balance_loss_mlp": 1.12040734, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.07963562881698232, + "language_loss": 0.86590552, + "learning_rate": 0.0009168349316683669, + "loss": 0.87731135, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.20166016, + "step": 1095, + "time_per_iteration": 2.7208545207977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157939, + "balance_loss_mlp": 1.1382103, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.06948100196361624, + "language_loss": 0.82885933, + "learning_rate": 0.0009166627974350741, + "loss": 0.84043866, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.19714355, + "step": 1096, + "time_per_iteration": 2.879690647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158751, + "balance_loss_mlp": 1.13850892, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.07894738519235364, + "language_loss": 0.89620626, + "learning_rate": 0.0009164905014389373, + "loss": 0.90779376, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.20239258, + "step": 1097, + "time_per_iteration": 2.7915890216827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174722, + "balance_loss_mlp": 1.15442061, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.08089010798718275, + "language_loss": 0.86655492, + "learning_rate": 0.0009163180437468476, + "loss": 0.87830216, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.20300293, + "step": 1098, + "time_per_iteration": 2.671910285949707 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01160878, + "balance_loss_mlp": 1.14083886, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.1273171739233691, + "language_loss": 0.85848475, + "learning_rate": 0.000916145424425759, + "loss": 0.87009346, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.20031738, + "step": 1099, + "time_per_iteration": 2.718719959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138682, + "balance_loss_mlp": 1.11927521, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.11827895321179892, + "language_loss": 0.90551817, + "learning_rate": 0.0009159726435426885, + "loss": 0.91690505, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.19384766, + "step": 1100, + "time_per_iteration": 4.622005939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096537, + "balance_loss_mlp": 1.07577038, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.08009025902543959, + "language_loss": 0.90283167, + "learning_rate": 0.0009157997011647154, + "loss": 0.91379714, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.2076416, + "step": 1101, + "time_per_iteration": 2.605741262435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082643, + "balance_loss_mlp": 1.0622586, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.10006580652166666, + "language_loss": 0.85976642, + "learning_rate": 0.0009156265973589817, + "loss": 0.87059283, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.20385742, + "step": 1102, + "time_per_iteration": 2.7997629642486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082906, + "balance_loss_mlp": 1.06256843, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.08882618780300273, + "language_loss": 0.89710194, + "learning_rate": 0.0009154533321926926, + "loss": 0.90793097, + "num_input_tokens_seen": 
91372848, + "router_z_loss_mlp": 0.20336914, + "step": 1103, + "time_per_iteration": 2.6505167484283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082845, + "balance_loss_mlp": 1.06240106, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.08104008133152642, + "language_loss": 0.87105876, + "learning_rate": 0.0009152799057331156, + "loss": 0.88188726, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.20446777, + "step": 1104, + "time_per_iteration": 3.16381573677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085023, + "balance_loss_mlp": 1.06503153, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.1303184369793021, + "language_loss": 0.90978825, + "learning_rate": 0.0009151063180475805, + "loss": 0.92063844, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.1998291, + "step": 1105, + "time_per_iteration": 2.519392490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081303, + "balance_loss_mlp": 1.06139469, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.09253503988008102, + "language_loss": 0.84230483, + "learning_rate": 0.0009149325692034803, + "loss": 0.85311788, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.19897461, + "step": 1106, + "time_per_iteration": 2.623030662536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122847, + "balance_loss_mlp": 1.11054456, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.03239256029122438, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80326271, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.12304688, + "step": 1107, + "time_per_iteration": 4.865934610366821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095405, + "balance_loss_mlp": 1.07612848, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 
0.08663251382833077, + "language_loss": 0.87545854, + "learning_rate": 0.0009145845883094678, + "loss": 0.88641262, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.19262695, + "step": 1108, + "time_per_iteration": 3.0644633769989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106513, + "balance_loss_mlp": 1.08767843, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.09154471330204571, + "language_loss": 0.84864843, + "learning_rate": 0.000914410356394654, + "loss": 0.85971349, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.18798828, + "step": 1109, + "time_per_iteration": 2.7818005084991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111664, + "balance_loss_mlp": 1.09850883, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.05901208331379503, + "language_loss": 0.84397328, + "learning_rate": 0.0009142359635914709, + "loss": 0.85513967, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.18151855, + "step": 1110, + "time_per_iteration": 3.0699398517608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132455, + "balance_loss_mlp": 1.11437058, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.07045633933043649, + "language_loss": 0.84396905, + "learning_rate": 0.0009140614099676245, + "loss": 0.85529351, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.18103027, + "step": 1111, + "time_per_iteration": 2.6896469593048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144685, + "balance_loss_mlp": 1.12654102, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.07609754946919366, + "language_loss": 0.82333195, + "learning_rate": 0.0009138866955908821, + "loss": 0.83477879, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.18151855, + "step": 1112, + "time_per_iteration": 2.9167656898498535 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01173372, + "balance_loss_mlp": 1.15541935, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.07536024812721688, + "language_loss": 0.80650687, + "learning_rate": 0.0009137118205290738, + "loss": 0.81824064, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.17956543, + "step": 1113, + "time_per_iteration": 3.038858652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173379, + "balance_loss_mlp": 1.15471053, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.08578166607433227, + "language_loss": 0.9008798, + "learning_rate": 0.0009135367848500924, + "loss": 0.91261363, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.18652344, + "step": 1114, + "time_per_iteration": 2.5301332473754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183524, + "balance_loss_mlp": 1.16561842, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.097679735811004, + "language_loss": 0.86396897, + "learning_rate": 0.0009133615886218927, + "loss": 0.87580419, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.17932129, + "step": 1115, + "time_per_iteration": 2.7787675857543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181259, + "balance_loss_mlp": 1.16279316, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.08896664083513224, + "language_loss": 0.87571919, + "learning_rate": 0.0009131862319124917, + "loss": 0.88753176, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.18469238, + "step": 1116, + "time_per_iteration": 2.7031164169311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177922, + "balance_loss_mlp": 1.15970659, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.07771910148821705, + "language_loss": 0.8379603, + "learning_rate": 0.0009130107147899691, + "loss": 0.84973955, + "num_input_tokens_seen": 
92555584, + "router_z_loss_mlp": 0.18237305, + "step": 1117, + "time_per_iteration": 2.7842912673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180049, + "balance_loss_mlp": 1.16186976, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.07252648730513606, + "language_loss": 0.85351467, + "learning_rate": 0.0009128350373224665, + "loss": 0.86531514, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.1817627, + "step": 1118, + "time_per_iteration": 2.547067880630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174302, + "balance_loss_mlp": 1.1582799, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.06807222888709992, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.8263073, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.16015625, + "step": 1119, + "time_per_iteration": 4.686914443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191791, + "balance_loss_mlp": 1.1730994, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07584418562153701, + "language_loss": 0.85298818, + "learning_rate": 0.0009124832016254005, + "loss": 0.86490607, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.18676758, + "step": 1120, + "time_per_iteration": 2.594407558441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179476, + "balance_loss_mlp": 1.16062903, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.07950209413805702, + "language_loss": 0.87972558, + "learning_rate": 0.0009123070435324316, + "loss": 0.89152032, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.18835449, + "step": 1121, + "time_per_iteration": 2.8215177059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068291, + "balance_loss_mlp": 1.05379486, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 
0.028005803680130233, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78944069, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.14453125, + "step": 1122, + "time_per_iteration": 5.0041632652282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159249, + "balance_loss_mlp": 1.14079511, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.08251943361984397, + "language_loss": 0.86073762, + "learning_rate": 0.0009119542471995752, + "loss": 0.87233007, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.18432617, + "step": 1123, + "time_per_iteration": 2.862286329269409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163328, + "balance_loss_mlp": 1.14537501, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.09258223897772182, + "language_loss": 0.81420332, + "learning_rate": 0.0009117776090966554, + "loss": 0.8258366, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.17956543, + "step": 1124, + "time_per_iteration": 2.957061767578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178568, + "balance_loss_mlp": 1.15982795, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.08713542738122697, + "language_loss": 0.86376691, + "learning_rate": 0.0009116008111274899, + "loss": 0.87555259, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.18725586, + "step": 1125, + "time_per_iteration": 3.2553656101226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134598, + "balance_loss_mlp": 1.12191415, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.04404830998294008, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80241525, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.12695312, + "step": 1126, + "time_per_iteration": 4.808468818664551 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01178721, + "balance_loss_mlp": 1.16074455, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.11245559393918578, + "language_loss": 0.8463136, + "learning_rate": 0.0009112467358650396, + "loss": 0.85810077, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.17993164, + "step": 1127, + "time_per_iteration": 3.2135119438171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203773, + "balance_loss_mlp": 1.18573689, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.12344639465473216, + "language_loss": 0.86497682, + "learning_rate": 0.0009110694587092192, + "loss": 0.87701452, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.18041992, + "step": 1128, + "time_per_iteration": 2.76655650138855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187728, + "balance_loss_mlp": 1.17007267, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.08979647183610162, + "language_loss": 0.81230694, + "learning_rate": 0.0009108920219620815, + "loss": 0.82418424, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.17675781, + "step": 1129, + "time_per_iteration": 2.654778242111206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213499, + "balance_loss_mlp": 1.19534314, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.09421163362280094, + "language_loss": 0.89139944, + "learning_rate": 0.0009107144256925133, + "loss": 0.90353441, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.18164062, + "step": 1130, + "time_per_iteration": 2.6828513145446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118696, + "balance_loss_mlp": 1.1690309, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.10043397732842237, + "language_loss": 0.82135975, + "learning_rate": 0.0009105366699694638, + "loss": 0.83322936, + "num_input_tokens_seen": 
94052032, + "router_z_loss_mlp": 0.17944336, + "step": 1131, + "time_per_iteration": 2.7368264198303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156124, + "balance_loss_mlp": 1.13807523, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.06866995192565088, + "language_loss": 0.8126269, + "learning_rate": 0.0009103587548619439, + "loss": 0.82418817, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.18066406, + "step": 1132, + "time_per_iteration": 2.8550221920013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127864, + "balance_loss_mlp": 1.10951805, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.07626365128544196, + "language_loss": 0.85966831, + "learning_rate": 0.0009101806804390261, + "loss": 0.87094694, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.18359375, + "step": 1133, + "time_per_iteration": 2.865067720413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104426, + "balance_loss_mlp": 1.08616304, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.0835029551695644, + "language_loss": 0.89787459, + "learning_rate": 0.0009100024467698453, + "loss": 0.90891886, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.18261719, + "step": 1134, + "time_per_iteration": 2.587308645248413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107106, + "balance_loss_mlp": 1.08858073, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.1261525750794289, + "language_loss": 0.8228271, + "learning_rate": 0.0009098240539235981, + "loss": 0.83389813, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.1854248, + "step": 1135, + "time_per_iteration": 2.672178268432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118389, + "balance_loss_mlp": 1.10042465, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 
0.07190677595982913, + "language_loss": 0.87357873, + "learning_rate": 0.0009096455019695423, + "loss": 0.88476264, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.17980957, + "step": 1136, + "time_per_iteration": 2.7987098693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132882, + "balance_loss_mlp": 1.1147505, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.07940180090442328, + "language_loss": 0.89624888, + "learning_rate": 0.000909466790976998, + "loss": 0.90757769, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.18139648, + "step": 1137, + "time_per_iteration": 2.477332830429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135864, + "balance_loss_mlp": 1.11760151, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.0834179991172278, + "language_loss": 0.82063508, + "learning_rate": 0.0009092879210153473, + "loss": 0.83199376, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.18261719, + "step": 1138, + "time_per_iteration": 3.12052321434021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144384, + "balance_loss_mlp": 1.12646723, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.08144398942367967, + "language_loss": 0.88541782, + "learning_rate": 0.0009091088921540333, + "loss": 0.89686167, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.17919922, + "step": 1139, + "time_per_iteration": 2.616718292236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059921, + "balance_loss_mlp": 1.04833436, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.03144960121690337, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76568598, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.11572266, + "step": 1140, + "time_per_iteration": 4.950219392776489 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01159199, + "balance_loss_mlp": 1.14099586, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.08175747516698374, + "language_loss": 0.84013134, + "learning_rate": 0.0009087503580104985, + "loss": 0.85172331, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.18212891, + "step": 1141, + "time_per_iteration": 2.7156832218170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169076, + "balance_loss_mlp": 1.15111113, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.09158845445189351, + "language_loss": 0.7908268, + "learning_rate": 0.0009085708528674728, + "loss": 0.80251753, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.17993164, + "step": 1142, + "time_per_iteration": 2.7931153774261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164556, + "balance_loss_mlp": 1.14653111, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.08286913258708346, + "language_loss": 0.86118239, + "learning_rate": 0.0009083911891031745, + "loss": 0.87282795, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.18041992, + "step": 1143, + "time_per_iteration": 3.116783857345581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117374, + "balance_loss_mlp": 1.15575087, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.10598120448533326, + "language_loss": 0.91152728, + "learning_rate": 0.0009082113667873553, + "loss": 0.92326462, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.18005371, + "step": 1144, + "time_per_iteration": 3.1333653926849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165589, + "balance_loss_mlp": 1.14781499, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.09559133609898889, + "language_loss": 0.9010762, + "learning_rate": 0.0009080313859898283, + "loss": 0.91273212, + "num_input_tokens_seen": 
95230304, + "router_z_loss_mlp": 0.17773438, + "step": 1145, + "time_per_iteration": 2.5269837379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158069, + "balance_loss_mlp": 1.13981819, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.08379728657337264, + "language_loss": 0.91627228, + "learning_rate": 0.0009078512467804684, + "loss": 0.92785299, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.18249512, + "step": 1146, + "time_per_iteration": 2.6481103897094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.13930488, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.08494148813195015, + "language_loss": 0.90029317, + "learning_rate": 0.0009076709492292119, + "loss": 0.91186154, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.17541504, + "step": 1147, + "time_per_iteration": 2.659444808959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156044, + "balance_loss_mlp": 1.1380074, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.08635236800942281, + "language_loss": 0.88836294, + "learning_rate": 0.0009074904934060562, + "loss": 0.89992332, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.18041992, + "step": 1148, + "time_per_iteration": 2.6803669929504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154403, + "balance_loss_mlp": 1.13666439, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.0889091403520225, + "language_loss": 0.84333098, + "learning_rate": 0.0009073098793810607, + "loss": 0.85487497, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.17749023, + "step": 1149, + "time_per_iteration": 2.9655888080596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142518, + "balance_loss_mlp": 1.12488723, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 
0.1004212836055253, + "language_loss": 0.88171208, + "learning_rate": 0.000907129107224346, + "loss": 0.89313722, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.17651367, + "step": 1150, + "time_per_iteration": 2.7072501182556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114998, + "balance_loss_mlp": 1.13255119, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.06570196764831916, + "language_loss": 0.88176614, + "learning_rate": 0.0009069481770060939, + "loss": 0.8932659, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.17443848, + "step": 1151, + "time_per_iteration": 2.685103178024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154372, + "balance_loss_mlp": 1.13708711, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.09650141097201487, + "language_loss": 0.83268076, + "learning_rate": 0.000906767088796548, + "loss": 0.84422451, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.17297363, + "step": 1152, + "time_per_iteration": 3.4740118980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116577, + "balance_loss_mlp": 1.14875841, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.08954893541671843, + "language_loss": 0.86883795, + "learning_rate": 0.0009065858426660127, + "loss": 0.88049567, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.17028809, + "step": 1153, + "time_per_iteration": 2.6959545612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162713, + "balance_loss_mlp": 1.14552331, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.08642937771359972, + "language_loss": 0.84477949, + "learning_rate": 0.0009064044386848543, + "loss": 0.85640663, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.17199707, + "step": 1154, + "time_per_iteration": 2.9327309131622314 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01148113, + "balance_loss_mlp": 1.13044643, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.10097530204718137, + "language_loss": 0.8819679, + "learning_rate": 0.0009062228769234997, + "loss": 0.89344907, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.17675781, + "step": 1155, + "time_per_iteration": 2.594517469406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131691, + "balance_loss_mlp": 1.11384535, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.08570412042921306, + "language_loss": 0.80458236, + "learning_rate": 0.0009060411574524376, + "loss": 0.81589925, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.17858887, + "step": 1156, + "time_per_iteration": 2.6829988956451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121054, + "balance_loss_mlp": 1.10336328, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.09330017299295373, + "language_loss": 0.87879562, + "learning_rate": 0.0009058592803422178, + "loss": 0.89000618, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.17712402, + "step": 1157, + "time_per_iteration": 3.181018829345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121897, + "balance_loss_mlp": 1.10911822, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.048914379983556036, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79832184, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.12792969, + "step": 1158, + "time_per_iteration": 4.887088775634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115665, + "balance_loss_mlp": 1.0982244, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.0696072904806853, + "language_loss": 0.89700031, + "learning_rate": 0.00090549505348681, + "loss": 0.90815699, + "num_input_tokens_seen": 
96402544, + "router_z_loss_mlp": 0.17456055, + "step": 1159, + "time_per_iteration": 2.598071813583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112274, + "balance_loss_mlp": 1.09486985, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.12380497141241992, + "language_loss": 0.83892691, + "learning_rate": 0.0009053127038830275, + "loss": 0.85004961, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.17407227, + "step": 1160, + "time_per_iteration": 2.972153663635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105235, + "balance_loss_mlp": 1.08817601, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.11211348915152936, + "language_loss": 0.86961317, + "learning_rate": 0.000905130196922898, + "loss": 0.88066548, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.17077637, + "step": 1161, + "time_per_iteration": 2.586404800415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103766, + "balance_loss_mlp": 1.08674335, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.08844003676149725, + "language_loss": 0.8712495, + "learning_rate": 0.0009049475326772769, + "loss": 0.88228714, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.17028809, + "step": 1162, + "time_per_iteration": 2.633775472640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115059, + "balance_loss_mlp": 1.09810734, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.08335674073816261, + "language_loss": 0.83002663, + "learning_rate": 0.0009047647112170811, + "loss": 0.84117723, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.16967773, + "step": 1163, + "time_per_iteration": 2.779890537261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112691, + "balance_loss_mlp": 1.11049509, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 
0.19679577404354898, + "language_loss": 0.87137246, + "learning_rate": 0.0009045817326132876, + "loss": 0.88264161, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.16418457, + "step": 1164, + "time_per_iteration": 3.703150749206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153627, + "balance_loss_mlp": 1.13630629, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.08115041291567808, + "language_loss": 0.83409214, + "learning_rate": 0.0009043985969369357, + "loss": 0.84562844, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.17333984, + "step": 1165, + "time_per_iteration": 2.8744845390319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175693, + "balance_loss_mlp": 1.15849137, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.06201627876445988, + "language_loss": 0.84104788, + "learning_rate": 0.0009042153042591245, + "loss": 0.85280478, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.17224121, + "step": 1166, + "time_per_iteration": 2.8617310523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184386, + "balance_loss_mlp": 1.16719604, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.08223980595448348, + "language_loss": 0.84917307, + "learning_rate": 0.0009040318546510146, + "loss": 0.86101699, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.17211914, + "step": 1167, + "time_per_iteration": 3.1852662563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184421, + "balance_loss_mlp": 1.16730213, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.0789242941151387, + "language_loss": 0.85142338, + "learning_rate": 0.0009038482481838275, + "loss": 0.86326754, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.17126465, + "step": 1168, + "time_per_iteration": 2.69252347946167 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01179663, + "balance_loss_mlp": 1.16241312, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.05697426763288438, + "language_loss": 0.86826229, + "learning_rate": 0.0009036644849288455, + "loss": 0.88005894, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.17260742, + "step": 1169, + "time_per_iteration": 3.1488285064697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174012, + "balance_loss_mlp": 1.15652442, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.08495924937221859, + "language_loss": 0.85084724, + "learning_rate": 0.0009034805649574118, + "loss": 0.86258733, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.1751709, + "step": 1170, + "time_per_iteration": 2.685328722000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183548, + "balance_loss_mlp": 1.16578627, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.11014400581169416, + "language_loss": 0.85017669, + "learning_rate": 0.0009032964883409308, + "loss": 0.86201215, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.17785645, + "step": 1171, + "time_per_iteration": 2.879601240158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114011, + "balance_loss_mlp": 1.10170817, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.052120324196256125, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74164546, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.12255859, + "step": 1172, + "time_per_iteration": 5.038167715072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198228, + "balance_loss_mlp": 1.18021595, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.07370263777730128, + "language_loss": 0.87101096, + "learning_rate": 0.0009029278654587462, + "loss": 0.88299322, + "num_input_tokens_seen": 
97623584, + "router_z_loss_mlp": 0.18017578, + "step": 1173, + "time_per_iteration": 2.627659559249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207558, + "balance_loss_mlp": 1.1888895, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.09375965630696953, + "language_loss": 0.82013619, + "learning_rate": 0.0009027433193361548, + "loss": 0.83221173, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.18652344, + "step": 1174, + "time_per_iteration": 2.8188316822052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191442, + "balance_loss_mlp": 1.17263079, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.09826481383842127, + "language_loss": 0.8677392, + "learning_rate": 0.00090255861685474, + "loss": 0.87965363, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.18798828, + "step": 1175, + "time_per_iteration": 2.7677559852600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187652, + "balance_loss_mlp": 1.16895974, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.09211807586067215, + "language_loss": 0.90504396, + "learning_rate": 0.0009023737580862095, + "loss": 0.91692042, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.18676758, + "step": 1176, + "time_per_iteration": 2.54901123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191354, + "balance_loss_mlp": 1.17276883, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0881916579324479, + "language_loss": 0.83226693, + "learning_rate": 0.0009021887431023321, + "loss": 0.84418046, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.18566895, + "step": 1177, + "time_per_iteration": 2.6121795177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174332, + "balance_loss_mlp": 1.15594959, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 
0.08194623484888001, + "language_loss": 0.87241113, + "learning_rate": 0.0009020035719749369, + "loss": 0.88415444, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.18359375, + "step": 1178, + "time_per_iteration": 2.7401885986328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158606, + "balance_loss_mlp": 1.14040256, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.0813633568079927, + "language_loss": 0.77680194, + "learning_rate": 0.0009018182447759136, + "loss": 0.78838801, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.18212891, + "step": 1179, + "time_per_iteration": 3.0078771114349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145135, + "balance_loss_mlp": 1.12688398, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.09172856476896407, + "language_loss": 0.79547179, + "learning_rate": 0.0009016327615772126, + "loss": 0.80692315, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.18249512, + "step": 1180, + "time_per_iteration": 2.956892251968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140365, + "balance_loss_mlp": 1.12199533, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.0875125644607483, + "language_loss": 0.87631428, + "learning_rate": 0.0009014471224508451, + "loss": 0.8877179, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.18359375, + "step": 1181, + "time_per_iteration": 2.6819214820861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140649, + "balance_loss_mlp": 1.12244546, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.12040521041324766, + "language_loss": 0.82781821, + "learning_rate": 0.0009012613274688823, + "loss": 0.8392247, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.18200684, + "step": 1182, + "time_per_iteration": 2.6545872688293457 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01127803, + "balance_loss_mlp": 1.10971928, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.11611648539449336, + "language_loss": 0.87670434, + "learning_rate": 0.0009010753767034565, + "loss": 0.88798231, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.1809082, + "step": 1183, + "time_per_iteration": 2.5755655765533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011456, + "balance_loss_mlp": 1.12726605, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.07779286107938752, + "language_loss": 0.78790247, + "learning_rate": 0.0009008892702267599, + "loss": 0.79935843, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.18347168, + "step": 1184, + "time_per_iteration": 2.9940855503082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145741, + "balance_loss_mlp": 1.12732279, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.09447672073297446, + "language_loss": 0.88500011, + "learning_rate": 0.0009007030081110457, + "loss": 0.89645755, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.18408203, + "step": 1185, + "time_per_iteration": 2.6603288650512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143533, + "balance_loss_mlp": 1.12500811, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.0853307601225198, + "language_loss": 0.84380877, + "learning_rate": 0.000900516590428627, + "loss": 0.85524416, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.18518066, + "step": 1186, + "time_per_iteration": 2.692070484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141181, + "balance_loss_mlp": 1.12318015, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.07243217971015652, + "language_loss": 0.89009422, + "learning_rate": 0.0009003300172518778, + "loss": 0.90150601, + "num_input_tokens_seen": 
98634336, + "router_z_loss_mlp": 0.17980957, + "step": 1187, + "time_per_iteration": 2.7073988914489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137333, + "balance_loss_mlp": 1.11980963, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.08424899879196017, + "language_loss": 0.83985436, + "learning_rate": 0.0009001432886532321, + "loss": 0.85122764, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.17529297, + "step": 1188, + "time_per_iteration": 2.9843039512634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146183, + "balance_loss_mlp": 1.12812281, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.0771143581641096, + "language_loss": 0.8654418, + "learning_rate": 0.0008999564047051843, + "loss": 0.87690365, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.18054199, + "step": 1189, + "time_per_iteration": 2.6047263145446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152979, + "balance_loss_mlp": 1.13572931, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.0974051284777214, + "language_loss": 0.85100305, + "learning_rate": 0.0008997693654802894, + "loss": 0.86253285, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.17272949, + "step": 1190, + "time_per_iteration": 2.6849515438079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134691, + "balance_loss_mlp": 1.11709571, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.08474903758704144, + "language_loss": 0.86204302, + "learning_rate": 0.0008995821710511625, + "loss": 0.87338996, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.17602539, + "step": 1191, + "time_per_iteration": 2.742478132247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126818, + "balance_loss_mlp": 1.10922277, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 
0.08571505564163927, + "language_loss": 0.84842807, + "learning_rate": 0.0008993948214904786, + "loss": 0.85969627, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.17602539, + "step": 1192, + "time_per_iteration": 2.6361818313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045247, + "balance_loss_mlp": 1.03237247, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.028329103864080232, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79467458, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.12890625, + "step": 1193, + "time_per_iteration": 4.969930171966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112876, + "balance_loss_mlp": 1.10972273, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.08612491826756107, + "language_loss": 0.78059292, + "learning_rate": 0.0008990196572654427, + "loss": 0.79188055, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.19018555, + "step": 1194, + "time_per_iteration": 2.8844966888427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140316, + "balance_loss_mlp": 1.12217188, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.10153558100200434, + "language_loss": 0.87920988, + "learning_rate": 0.0008988318427467426, + "loss": 0.89061302, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.18151855, + "step": 1195, + "time_per_iteration": 2.687624931335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142082, + "balance_loss_mlp": 1.12412882, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.08230259672194101, + "language_loss": 0.86206847, + "learning_rate": 0.0008986438733877887, + "loss": 0.87348932, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.17956543, + "step": 1196, + "time_per_iteration": 3.4957938194274902 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01153338, + "balance_loss_mlp": 1.13559973, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.06895925957333625, + "language_loss": 0.8397938, + "learning_rate": 0.0008984557492615576, + "loss": 0.85132712, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.1776123, + "step": 1197, + "time_per_iteration": 3.004096031188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148862, + "balance_loss_mlp": 1.13082576, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.07382939590065767, + "language_loss": 0.89479733, + "learning_rate": 0.0008982674704410854, + "loss": 0.906286, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.18029785, + "step": 1198, + "time_per_iteration": 2.6988983154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115166, + "balance_loss_mlp": 1.13448238, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.0949037059675448, + "language_loss": 0.77658606, + "learning_rate": 0.0008980790369994682, + "loss": 0.78810263, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.17199707, + "step": 1199, + "time_per_iteration": 2.9618003368377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154837, + "balance_loss_mlp": 1.13739705, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.07145246308543461, + "language_loss": 0.87144834, + "learning_rate": 0.000897890449009863, + "loss": 0.88299668, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.17443848, + "step": 1200, + "time_per_iteration": 2.7796213626861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116547, + "balance_loss_mlp": 1.14776802, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.09854596236312584, + "language_loss": 0.89783561, + "learning_rate": 0.0008977017065454853, + "loss": 0.90949035, + "num_input_tokens_seen": 
99835552, + "router_z_loss_mlp": 0.17712402, + "step": 1201, + "time_per_iteration": 2.7383389472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118456, + "balance_loss_mlp": 1.16748941, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.06681897447915772, + "language_loss": 0.79928529, + "learning_rate": 0.0008975128096796121, + "loss": 0.81113094, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.17077637, + "step": 1202, + "time_per_iteration": 2.893461227416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174856, + "balance_loss_mlp": 1.15766644, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.09321616984993739, + "language_loss": 0.85471004, + "learning_rate": 0.0008973237584855794, + "loss": 0.86645865, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.17211914, + "step": 1203, + "time_per_iteration": 2.898749589920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174851, + "balance_loss_mlp": 1.15761375, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.08459599639125864, + "language_loss": 0.82237399, + "learning_rate": 0.0008971345530367832, + "loss": 0.83412254, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.17248535, + "step": 1204, + "time_per_iteration": 2.5461792945861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169858, + "balance_loss_mlp": 1.15260816, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.08050630983240942, + "language_loss": 0.85032547, + "learning_rate": 0.0008969451934066799, + "loss": 0.86202407, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.17272949, + "step": 1205, + "time_per_iteration": 2.8455100059509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157737, + "balance_loss_mlp": 1.1401062, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 
0.09118158600793376, + "language_loss": 0.79779387, + "learning_rate": 0.0008967556796687854, + "loss": 0.80937129, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.1763916, + "step": 1206, + "time_per_iteration": 2.977187395095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166868, + "balance_loss_mlp": 1.14940381, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.08470401629761377, + "language_loss": 0.83790028, + "learning_rate": 0.0008965660118966752, + "loss": 0.8495689, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.17480469, + "step": 1207, + "time_per_iteration": 2.9695510864257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164887, + "balance_loss_mlp": 1.14745879, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.07067711449707674, + "language_loss": 0.89920551, + "learning_rate": 0.0008963761901639851, + "loss": 0.9108544, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.17443848, + "step": 1208, + "time_per_iteration": 2.8432528972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164904, + "balance_loss_mlp": 1.14763093, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.07998084189671781, + "language_loss": 0.83062428, + "learning_rate": 0.0008961862145444103, + "loss": 0.84227335, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.17285156, + "step": 1209, + "time_per_iteration": 2.7639503479003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161441, + "balance_loss_mlp": 1.14392972, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.07404933879866919, + "language_loss": 0.85019284, + "learning_rate": 0.0008959960851117059, + "loss": 0.86180723, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.17541504, + "step": 1210, + "time_per_iteration": 2.639765739440918 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01142071, + "balance_loss_mlp": 1.12463081, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.06764705739880358, + "language_loss": 0.83661717, + "learning_rate": 0.0008958058019396868, + "loss": 0.8480379, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.17468262, + "step": 1211, + "time_per_iteration": 2.8551721572875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114749, + "balance_loss_mlp": 1.13016868, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.08875501668915448, + "language_loss": 0.86489981, + "learning_rate": 0.0008956153651022274, + "loss": 0.87637472, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.17333984, + "step": 1212, + "time_per_iteration": 2.7765469551086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144101, + "balance_loss_mlp": 1.12625563, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.07932001584083075, + "language_loss": 0.83832914, + "learning_rate": 0.0008954247746732618, + "loss": 0.84977019, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.17858887, + "step": 1213, + "time_per_iteration": 2.6084651947021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135394, + "balance_loss_mlp": 1.11788201, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.07442615591494516, + "language_loss": 0.90398782, + "learning_rate": 0.0008952340307267837, + "loss": 0.91534173, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.17529297, + "step": 1214, + "time_per_iteration": 2.89178466796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125335, + "balance_loss_mlp": 1.10793078, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.1453012637227399, + "language_loss": 0.8336947, + "learning_rate": 0.0008950431333368468, + "loss": 0.84494805, + "num_input_tokens_seen": 
100863632, + "router_z_loss_mlp": 0.17419434, + "step": 1215, + "time_per_iteration": 2.5870306491851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111701, + "balance_loss_mlp": 1.09912825, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.07975417299664793, + "language_loss": 0.84537351, + "learning_rate": 0.0008948520825775634, + "loss": 0.8565436, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.17919922, + "step": 1216, + "time_per_iteration": 3.6591601371765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111106, + "balance_loss_mlp": 1.0930953, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.083699003973451, + "language_loss": 0.83777452, + "learning_rate": 0.0008946608785231067, + "loss": 0.84888518, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.1796875, + "step": 1217, + "time_per_iteration": 2.910045862197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122887, + "balance_loss_mlp": 1.10500622, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.07421571727754571, + "language_loss": 0.8465637, + "learning_rate": 0.0008944695212477084, + "loss": 0.85779262, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.17871094, + "step": 1218, + "time_per_iteration": 2.524942636489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136819, + "balance_loss_mlp": 1.11900902, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.08988714641466837, + "language_loss": 0.85843921, + "learning_rate": 0.0008942780108256599, + "loss": 0.86980736, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.17822266, + "step": 1219, + "time_per_iteration": 2.638685703277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122459, + "balance_loss_mlp": 1.10441041, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 
0.09147837202786416, + "language_loss": 0.86524791, + "learning_rate": 0.0008940863473313121, + "loss": 0.87647247, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.18054199, + "step": 1220, + "time_per_iteration": 2.5017247200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141789, + "balance_loss_mlp": 1.12406206, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.08221984397196716, + "language_loss": 0.87834692, + "learning_rate": 0.0008938945308390756, + "loss": 0.88976479, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.17724609, + "step": 1221, + "time_per_iteration": 2.663565158843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145313, + "balance_loss_mlp": 1.1284095, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.07596551545175816, + "language_loss": 0.86929715, + "learning_rate": 0.00089370256142342, + "loss": 0.88075024, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.16918945, + "step": 1222, + "time_per_iteration": 2.7675375938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143, + "balance_loss_mlp": 1.12577403, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.07111090095827391, + "language_loss": 0.84719163, + "learning_rate": 0.0008935104391588746, + "loss": 0.8586216, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.17248535, + "step": 1223, + "time_per_iteration": 2.7930641174316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141088, + "balance_loss_mlp": 1.12308729, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.09172313762061536, + "language_loss": 0.83210915, + "learning_rate": 0.0008933181641200276, + "loss": 0.84352005, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.18005371, + "step": 1224, + "time_per_iteration": 3.184723138809204 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0113861, + "balance_loss_mlp": 1.1213243, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.08544958772393396, + "language_loss": 0.85490656, + "learning_rate": 0.0008931257363815271, + "loss": 0.86629266, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.1730957, + "step": 1225, + "time_per_iteration": 2.9049925804138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116947, + "balance_loss_mlp": 1.09978044, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.08572157059192624, + "language_loss": 0.8983537, + "learning_rate": 0.0008929331560180798, + "loss": 0.90952325, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.171875, + "step": 1226, + "time_per_iteration": 2.976716995239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119433, + "balance_loss_mlp": 1.10198092, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.07629670414533757, + "language_loss": 0.90995669, + "learning_rate": 0.0008927404231044525, + "loss": 0.92115104, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.17468262, + "step": 1227, + "time_per_iteration": 2.754908561706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103828, + "balance_loss_mlp": 1.08611393, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.07882349010207228, + "language_loss": 0.81471217, + "learning_rate": 0.0008925475377154703, + "loss": 0.82575047, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.17736816, + "step": 1228, + "time_per_iteration": 2.7809646129608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100869, + "balance_loss_mlp": 1.08254623, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.07142925877548961, + "language_loss": 0.82040304, + "learning_rate": 0.0008923544999260183, + "loss": 0.83141172, + "num_input_tokens_seen": 
101917968, + "router_z_loss_mlp": 0.18322754, + "step": 1229, + "time_per_iteration": 2.760239362716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110144, + "balance_loss_mlp": 1.09266782, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.12387153159230253, + "language_loss": 0.91337013, + "learning_rate": 0.00089216130981104, + "loss": 0.92447156, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.17480469, + "step": 1230, + "time_per_iteration": 3.121588945388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110904, + "balance_loss_mlp": 1.09090781, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.07661504881361146, + "language_loss": 0.82228827, + "learning_rate": 0.000891967967445539, + "loss": 0.83337867, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.18139648, + "step": 1231, + "time_per_iteration": 2.7672059535980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109921, + "balance_loss_mlp": 1.0920639, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.054732650189263314, + "language_loss": 0.88646662, + "learning_rate": 0.0008917744729045772, + "loss": 0.89756578, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.17871094, + "step": 1232, + "time_per_iteration": 2.9028637409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104934, + "balance_loss_mlp": 1.08743405, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.08391850168433768, + "language_loss": 0.83650339, + "learning_rate": 0.0008915808262632757, + "loss": 0.84755272, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.1751709, + "step": 1233, + "time_per_iteration": 2.870555877685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123449, + "balance_loss_mlp": 1.10509062, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 
0.09539034143195195, + "language_loss": 0.92907977, + "learning_rate": 0.0008913870275968148, + "loss": 0.94031429, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.18359375, + "step": 1234, + "time_per_iteration": 2.7251648902893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109776, + "balance_loss_mlp": 1.09154916, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.06697050939505883, + "language_loss": 0.87199342, + "learning_rate": 0.0008911930769804342, + "loss": 0.88309121, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.18237305, + "step": 1235, + "time_per_iteration": 3.268287420272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124889, + "balance_loss_mlp": 1.10593486, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.08058060241162714, + "language_loss": 0.91074061, + "learning_rate": 0.0008909989744894318, + "loss": 0.92198944, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.1895752, + "step": 1236, + "time_per_iteration": 2.8918802738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118206, + "balance_loss_mlp": 1.10007429, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.11301283658583765, + "language_loss": 0.81326294, + "learning_rate": 0.0008908047201991649, + "loss": 0.82444501, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.18127441, + "step": 1237, + "time_per_iteration": 2.8053224086761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111628, + "balance_loss_mlp": 1.09433031, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.0928222329851358, + "language_loss": 0.86241579, + "learning_rate": 0.0008906103141850502, + "loss": 0.87353206, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.17321777, + "step": 1238, + "time_per_iteration": 2.90500545501709 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01117636, + "balance_loss_mlp": 1.09980249, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.08449694721293455, + "language_loss": 0.87626004, + "learning_rate": 0.0008904157565225621, + "loss": 0.88743639, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.17834473, + "step": 1239, + "time_per_iteration": 2.687969923019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126339, + "balance_loss_mlp": 1.10839748, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.08713278777958322, + "language_loss": 0.815947, + "learning_rate": 0.000890221047287235, + "loss": 0.82721043, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.17944336, + "step": 1240, + "time_per_iteration": 3.531710386276245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134139, + "balance_loss_mlp": 1.11636496, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.07670600064189544, + "language_loss": 0.90527886, + "learning_rate": 0.0008900261865546615, + "loss": 0.91662019, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.17797852, + "step": 1241, + "time_per_iteration": 2.6662704944610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152535, + "balance_loss_mlp": 1.13414097, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.12487758336027797, + "language_loss": 0.84415132, + "learning_rate": 0.0008898311744004936, + "loss": 0.85567665, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.18408203, + "step": 1242, + "time_per_iteration": 2.763388156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149998, + "balance_loss_mlp": 1.13165212, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.06740377455140158, + "language_loss": 0.86921692, + "learning_rate": 0.0008896360109004414, + "loss": 0.88071686, + 
"num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.18359375, + "step": 1243, + "time_per_iteration": 2.6441633701324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140632, + "balance_loss_mlp": 1.12121248, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.09575659644731266, + "language_loss": 0.84275168, + "learning_rate": 0.0008894406961302742, + "loss": 0.85415804, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.1940918, + "step": 1244, + "time_per_iteration": 2.6425938606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112564, + "balance_loss_mlp": 1.10582733, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.07353599262773654, + "language_loss": 0.83287829, + "learning_rate": 0.0008892452301658201, + "loss": 0.84413469, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.19799805, + "step": 1245, + "time_per_iteration": 2.9552412033081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105353, + "balance_loss_mlp": 1.08604133, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.06971047839699994, + "language_loss": 0.83254242, + "learning_rate": 0.0008890496130829653, + "loss": 0.84359598, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.19287109, + "step": 1246, + "time_per_iteration": 2.714538812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094484, + "balance_loss_mlp": 1.07490993, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.07160128232814054, + "language_loss": 0.85448045, + "learning_rate": 0.0008888538449576555, + "loss": 0.86542535, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.19567871, + "step": 1247, + "time_per_iteration": 2.5854134559631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081471, + "balance_loss_mlp": 1.06212282, + "epoch": 0.2400923432089265, + "flos": 
485310818304.0, + "grad_norm": 0.10364601092251456, + "language_loss": 0.82938588, + "learning_rate": 0.0008886579258658944, + "loss": 0.84020054, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.1932373, + "step": 1248, + "time_per_iteration": 2.56381893157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085911, + "balance_loss_mlp": 1.06643224, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.11636637674492897, + "language_loss": 0.84617007, + "learning_rate": 0.0008884618558837446, + "loss": 0.8570292, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.19470215, + "step": 1249, + "time_per_iteration": 2.8670427799224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092713, + "balance_loss_mlp": 1.07287669, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.09934462101700196, + "language_loss": 0.86105502, + "learning_rate": 0.0008882656350873273, + "loss": 0.87198216, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.19836426, + "step": 1250, + "time_per_iteration": 2.9198391437530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095988, + "balance_loss_mlp": 1.07702184, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.10386778667644601, + "language_loss": 0.86847913, + "learning_rate": 0.0008880692635528219, + "loss": 0.879439, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.1895752, + "step": 1251, + "time_per_iteration": 3.114600658416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108272, + "balance_loss_mlp": 1.08975875, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.09512533379834028, + "language_loss": 0.89605117, + "learning_rate": 0.0008878727413564669, + "loss": 0.90713388, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.18518066, + "step": 1252, + "time_per_iteration": 2.7784321308135986 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044903, + "balance_loss_mlp": 1.0333159, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.02598255704274824, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81180501, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.11572266, + "step": 1253, + "time_per_iteration": 4.945368528366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142164, + "balance_loss_mlp": 1.12338829, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.08359922246859781, + "language_loss": 0.78146553, + "learning_rate": 0.0008874792452834528, + "loss": 0.79288721, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.18774414, + "step": 1254, + "time_per_iteration": 2.765700340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144466, + "balance_loss_mlp": 1.12684703, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.08184252001830684, + "language_loss": 0.87274945, + "learning_rate": 0.0008872822715595626, + "loss": 0.88419414, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.17626953, + "step": 1255, + "time_per_iteration": 2.687319040298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141993, + "balance_loss_mlp": 1.12460077, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.10883062221863066, + "language_loss": 0.86691022, + "learning_rate": 0.0008870851474793598, + "loss": 0.87833017, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.17419434, + "step": 1256, + "time_per_iteration": 2.6231887340545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136562, + "balance_loss_mlp": 1.11930037, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.08915320009922777, + "language_loss": 0.89053321, + "learning_rate": 0.0008868878731193752, + "loss": 
0.90189886, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.17285156, + "step": 1257, + "time_per_iteration": 2.928931713104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113174, + "balance_loss_mlp": 1.11484766, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.08262742442990392, + "language_loss": 0.89427495, + "learning_rate": 0.0008866904485561973, + "loss": 0.90559232, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.16906738, + "step": 1258, + "time_per_iteration": 2.7494447231292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136898, + "balance_loss_mlp": 1.11986327, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.08559449998713918, + "language_loss": 0.82794583, + "learning_rate": 0.000886492873866473, + "loss": 0.83931482, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.17053223, + "step": 1259, + "time_per_iteration": 2.841770648956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112569, + "balance_loss_mlp": 1.10853612, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.12665734927529698, + "language_loss": 0.8437835, + "learning_rate": 0.000886295149126908, + "loss": 0.85504043, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.17163086, + "step": 1260, + "time_per_iteration": 2.7847495079040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119489, + "balance_loss_mlp": 1.10270476, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.13276121908066757, + "language_loss": 0.85482794, + "learning_rate": 0.0008860972744142655, + "loss": 0.86602283, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.16796875, + "step": 1261, + "time_per_iteration": 2.9415853023529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117567, + "balance_loss_mlp": 1.10078192, + "epoch": 0.2427856868026164, + 
"flos": 626878849536.0, + "grad_norm": 0.09469206100439348, + "language_loss": 0.81489432, + "learning_rate": 0.0008858992498053671, + "loss": 0.82606995, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.16796875, + "step": 1262, + "time_per_iteration": 2.8460397720336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058087, + "balance_loss_mlp": 1.04578424, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.030096600393216412, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.7764684, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.12304688, + "step": 1263, + "time_per_iteration": 4.891434192657471 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164356, + "balance_loss_mlp": 1.14685583, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.07687362244804527, + "language_loss": 0.83471984, + "learning_rate": 0.0008855027512063817, + "loss": 0.84636343, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.17504883, + "step": 1264, + "time_per_iteration": 2.729905843734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188155, + "balance_loss_mlp": 1.17034483, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.10565566639423048, + "language_loss": 0.85338992, + "learning_rate": 0.0008853042773702292, + "loss": 0.86527145, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.17810059, + "step": 1265, + "time_per_iteration": 2.7027270793914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213519, + "balance_loss_mlp": 1.19497013, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.10310511352597752, + "language_loss": 0.87869942, + "learning_rate": 0.0008851056539456896, + "loss": 0.89083463, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.1854248, + "step": 1266, + "time_per_iteration": 
2.7062103748321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190822, + "balance_loss_mlp": 1.17235637, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.106198185782814, + "language_loss": 0.81649381, + "learning_rate": 0.0008849068810098755, + "loss": 0.82840204, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.18469238, + "step": 1267, + "time_per_iteration": 3.329357862472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169809, + "balance_loss_mlp": 1.15086627, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.11133940138273103, + "language_loss": 0.82717752, + "learning_rate": 0.0008847079586399575, + "loss": 0.83887565, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.18945312, + "step": 1268, + "time_per_iteration": 2.558319091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131294, + "balance_loss_mlp": 1.11318588, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.08817279245044941, + "language_loss": 0.85679001, + "learning_rate": 0.0008845088869131641, + "loss": 0.86810291, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.18103027, + "step": 1269, + "time_per_iteration": 2.692885637283325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122646, + "balance_loss_mlp": 1.10412109, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.07664646159291034, + "language_loss": 0.88602984, + "learning_rate": 0.0008843096659067818, + "loss": 0.89725631, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.18505859, + "step": 1270, + "time_per_iteration": 2.688197374343872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117697, + "balance_loss_mlp": 1.09989929, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.06543357243765746, + "language_loss": 0.86065173, + "learning_rate": 
0.000884110295698155, + "loss": 0.87182868, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.17822266, + "step": 1271, + "time_per_iteration": 2.9497103691101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113614, + "balance_loss_mlp": 1.09520805, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.10345235518870362, + "language_loss": 0.85674417, + "learning_rate": 0.0008839107763646861, + "loss": 0.86788034, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.18395996, + "step": 1272, + "time_per_iteration": 2.6293063163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111354, + "balance_loss_mlp": 1.09307909, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.0866440520117465, + "language_loss": 0.90339661, + "learning_rate": 0.0008837111079838353, + "loss": 0.91451013, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.18273926, + "step": 1273, + "time_per_iteration": 2.7676210403442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112497, + "balance_loss_mlp": 1.10732698, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.08933257913148762, + "language_loss": 0.89889824, + "learning_rate": 0.000883511290633121, + "loss": 0.91014791, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.17651367, + "step": 1274, + "time_per_iteration": 2.5634043216705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111162, + "balance_loss_mlp": 1.09361923, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.08498045219099847, + "language_loss": 0.92045552, + "learning_rate": 0.000883311324390119, + "loss": 0.93157172, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.18005371, + "step": 1275, + "time_per_iteration": 2.688175678253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117687, + "balance_loss_mlp": 1.09850657, 
+ "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.093400697768974, + "language_loss": 0.81587857, + "learning_rate": 0.0008831112093324629, + "loss": 0.82705545, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.19177246, + "step": 1276, + "time_per_iteration": 3.0782830715179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120052, + "balance_loss_mlp": 1.10156226, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.07571489376575821, + "language_loss": 0.88611054, + "learning_rate": 0.0008829109455378444, + "loss": 0.89731109, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.18481445, + "step": 1277, + "time_per_iteration": 2.7325568199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130651, + "balance_loss_mlp": 1.11251891, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.08746979241051268, + "language_loss": 0.86345637, + "learning_rate": 0.000882710533084013, + "loss": 0.87476289, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.18139648, + "step": 1278, + "time_per_iteration": 2.647641658782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113502, + "balance_loss_mlp": 1.11687636, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.0699906863373026, + "language_loss": 0.89239269, + "learning_rate": 0.0008825099720487755, + "loss": 0.90374291, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.18164062, + "step": 1279, + "time_per_iteration": 2.647472858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108592, + "balance_loss_mlp": 1.07490551, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.04364177649541596, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76347059, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.11035156, + "step": 1280, + 
"time_per_iteration": 4.876530647277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056171, + "balance_loss_mlp": 1.04515576, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.029948837084711404, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79000282, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.11035156, + "step": 1281, + "time_per_iteration": 4.817251205444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130582, + "balance_loss_mlp": 1.11283183, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.0778912228408071, + "language_loss": 0.89449739, + "learning_rate": 0.0008819073982335619, + "loss": 0.9058032, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.17773438, + "step": 1282, + "time_per_iteration": 2.849764823913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139737, + "balance_loss_mlp": 1.12209415, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.06136900444292705, + "language_loss": 0.84456879, + "learning_rate": 0.0008817062436519235, + "loss": 0.85596615, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.17651367, + "step": 1283, + "time_per_iteration": 2.662811040878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126818, + "balance_loss_mlp": 1.10860264, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.11946768082571088, + "language_loss": 0.895989, + "learning_rate": 0.0008815049408787788, + "loss": 0.90725714, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.18212891, + "step": 1284, + "time_per_iteration": 2.5498671531677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118472, + "balance_loss_mlp": 1.10030437, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.07911934764568136, + "language_loss": 0.85533321, + 
"learning_rate": 0.0008813034899922805, + "loss": 0.86651796, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.1817627, + "step": 1285, + "time_per_iteration": 2.546613931655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112528, + "balance_loss_mlp": 1.10687399, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.09325179905503529, + "language_loss": 0.89224762, + "learning_rate": 0.0008811018910706387, + "loss": 0.90350044, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.18395996, + "step": 1286, + "time_per_iteration": 2.5715928077697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124504, + "balance_loss_mlp": 1.10582423, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.08651255320330896, + "language_loss": 0.81603038, + "learning_rate": 0.0008809001441921211, + "loss": 0.82727551, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.18688965, + "step": 1287, + "time_per_iteration": 2.76352858543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116455, + "balance_loss_mlp": 1.09800124, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.07934964537800443, + "language_loss": 0.85291266, + "learning_rate": 0.0008806982494350528, + "loss": 0.86407721, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.18457031, + "step": 1288, + "time_per_iteration": 2.6464178562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125382, + "balance_loss_mlp": 1.10674942, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.07889330448691204, + "language_loss": 0.89930373, + "learning_rate": 0.0008804962068778161, + "loss": 0.91055757, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.18615723, + "step": 1289, + "time_per_iteration": 2.8725006580352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123355, + 
"balance_loss_mlp": 1.10481799, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.09114492679937135, + "language_loss": 0.80640042, + "learning_rate": 0.0008802940165988511, + "loss": 0.81763393, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.18530273, + "step": 1290, + "time_per_iteration": 2.9053151607513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113226, + "balance_loss_mlp": 1.11324596, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.07850606096458997, + "language_loss": 0.88298845, + "learning_rate": 0.000880091678676655, + "loss": 0.89431107, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.18981934, + "step": 1291, + "time_per_iteration": 2.8338379859924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115571, + "balance_loss_mlp": 1.09697485, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.0792961220184265, + "language_loss": 0.89043152, + "learning_rate": 0.0008798891931897821, + "loss": 0.90158725, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.18579102, + "step": 1292, + "time_per_iteration": 2.7769196033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121528, + "balance_loss_mlp": 1.10277641, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.0746346978796093, + "language_loss": 0.84222198, + "learning_rate": 0.0008796865602168447, + "loss": 0.8534373, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.18737793, + "step": 1293, + "time_per_iteration": 2.560858964920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115627, + "balance_loss_mlp": 1.09803176, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.06740604853273545, + "language_loss": 0.88270545, + "learning_rate": 0.0008794837798365115, + "loss": 0.89386165, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 
0.17614746, + "step": 1294, + "time_per_iteration": 2.6477129459381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125631, + "balance_loss_mlp": 1.10763049, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.0873145111764115, + "language_loss": 0.88408256, + "learning_rate": 0.0008792808521275089, + "loss": 0.89533883, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.18017578, + "step": 1295, + "time_per_iteration": 2.7224135398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121076, + "balance_loss_mlp": 1.10262191, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.0692696283298791, + "language_loss": 0.87340117, + "learning_rate": 0.0008790777771686206, + "loss": 0.88461185, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.18444824, + "step": 1296, + "time_per_iteration": 2.61126446723938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113013, + "balance_loss_mlp": 1.09509635, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.07573373752967896, + "language_loss": 0.84983516, + "learning_rate": 0.0008788745550386872, + "loss": 0.86096525, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.17932129, + "step": 1297, + "time_per_iteration": 2.573880672454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117051, + "balance_loss_mlp": 1.09876418, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.10171762649266601, + "language_loss": 0.797032, + "learning_rate": 0.0008786711858166063, + "loss": 0.80820251, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.18286133, + "step": 1298, + "time_per_iteration": 2.9712767601013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123606, + "balance_loss_mlp": 1.10497391, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.0822091876784568, + 
"language_loss": 0.83161783, + "learning_rate": 0.0008784676695813332, + "loss": 0.8428539, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.1862793, + "step": 1299, + "time_per_iteration": 2.966691017150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129878, + "balance_loss_mlp": 1.11144853, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.08080427389944742, + "language_loss": 0.84450245, + "learning_rate": 0.0008782640064118796, + "loss": 0.85580122, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.18408203, + "step": 1300, + "time_per_iteration": 2.92551589012146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240263, + "balance_loss_mlp": 1.22471797, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.06645546985774646, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77425015, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.15527344, + "step": 1301, + "time_per_iteration": 4.9493842124938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114228, + "balance_loss_mlp": 1.12376654, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.09006790660725612, + "language_loss": 0.8623417, + "learning_rate": 0.0008778562395867648, + "loss": 0.87376451, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.18518066, + "step": 1302, + "time_per_iteration": 2.635500907897949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122782, + "balance_loss_mlp": 1.10403061, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.07479626477523657, + "language_loss": 0.83630598, + "learning_rate": 0.0008776521360894127, + "loss": 0.84753382, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.1875, + "step": 1303, + "time_per_iteration": 2.640951156616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01090617, + "balance_loss_mlp": 1.07707512, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.0418328343897397, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80052686, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.13574219, + "step": 1304, + "time_per_iteration": 4.842891454696655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104198, + "balance_loss_mlp": 1.08618569, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.0798377990367126, + "language_loss": 0.90237606, + "learning_rate": 0.0008772434893213186, + "loss": 0.91341805, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.18017578, + "step": 1305, + "time_per_iteration": 2.6264374256134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097469, + "balance_loss_mlp": 1.07925391, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.07815304176143087, + "language_loss": 0.84344316, + "learning_rate": 0.0008770389462092276, + "loss": 0.85441786, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.18225098, + "step": 1306, + "time_per_iteration": 2.6599185466766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093714, + "balance_loss_mlp": 1.07480729, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.08248282915226902, + "language_loss": 0.86642498, + "learning_rate": 0.0008768342567176357, + "loss": 0.87736213, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.18908691, + "step": 1307, + "time_per_iteration": 2.919123411178589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094095, + "balance_loss_mlp": 1.07524765, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.07892434793160769, + "language_loss": 0.90316761, + "learning_rate": 0.0008766294209260107, + "loss": 0.91410857, + "num_input_tokens_seen": 108623072, + 
"router_z_loss_mlp": 0.18859863, + "step": 1308, + "time_per_iteration": 2.703994035720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093703, + "balance_loss_mlp": 1.07496333, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.09325948106778781, + "language_loss": 0.9126637, + "learning_rate": 0.0008764244389138767, + "loss": 0.92360079, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.18725586, + "step": 1309, + "time_per_iteration": 2.6175687313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092261, + "balance_loss_mlp": 1.07365251, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.10626806402083949, + "language_loss": 0.81772095, + "learning_rate": 0.000876219310760815, + "loss": 0.82864356, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.18603516, + "step": 1310, + "time_per_iteration": 2.8659133911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097983, + "balance_loss_mlp": 1.07988715, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.13076548306856256, + "language_loss": 0.81004, + "learning_rate": 0.0008760140365464631, + "loss": 0.82101983, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.18103027, + "step": 1311, + "time_per_iteration": 2.646810531616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120372, + "balance_loss_mlp": 1.10276532, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.11580551837549759, + "language_loss": 0.87203217, + "learning_rate": 0.0008758086163505156, + "loss": 0.88323587, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.17626953, + "step": 1312, + "time_per_iteration": 2.601256847381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135664, + "balance_loss_mlp": 1.11779475, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 
0.0666103465279768, + "language_loss": 0.89063561, + "learning_rate": 0.0008756030502527239, + "loss": 0.90199232, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.17883301, + "step": 1313, + "time_per_iteration": 2.8330187797546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161369, + "balance_loss_mlp": 1.14360678, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.0708022330446315, + "language_loss": 0.90153992, + "learning_rate": 0.0008753973383328954, + "loss": 0.91315365, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.17785645, + "step": 1314, + "time_per_iteration": 2.685375928878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011545, + "balance_loss_mlp": 1.13647509, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.08974334028560671, + "language_loss": 0.83722651, + "learning_rate": 0.0008751914806708952, + "loss": 0.84877157, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.18029785, + "step": 1315, + "time_per_iteration": 2.6155343055725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164677, + "balance_loss_mlp": 1.14708161, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.08978858583773926, + "language_loss": 0.81837153, + "learning_rate": 0.0008749854773466439, + "loss": 0.83001828, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.17614746, + "step": 1316, + "time_per_iteration": 2.7219769954681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163056, + "balance_loss_mlp": 1.14553261, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.07528804981442601, + "language_loss": 0.8451466, + "learning_rate": 0.0008747793284401192, + "loss": 0.85677719, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.17541504, + "step": 1317, + "time_per_iteration": 2.7144973278045654 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01151322, + "balance_loss_mlp": 1.13359582, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.08898497659473818, + "language_loss": 0.85280555, + "learning_rate": 0.0008745730340313551, + "loss": 0.86431873, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.17736816, + "step": 1318, + "time_per_iteration": 2.7930002212524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.13595057, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.08370435102239727, + "language_loss": 0.84217906, + "learning_rate": 0.0008743665942004422, + "loss": 0.85371482, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.1763916, + "step": 1319, + "time_per_iteration": 2.68245530128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160638, + "balance_loss_mlp": 1.14311421, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.07392804364708638, + "language_loss": 0.92852235, + "learning_rate": 0.0008741600090275277, + "loss": 0.9401288, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.17529297, + "step": 1320, + "time_per_iteration": 2.5977306365966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163682, + "balance_loss_mlp": 1.14569294, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.10450079995548846, + "language_loss": 0.8392204, + "learning_rate": 0.0008739532785928151, + "loss": 0.8508572, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.17993164, + "step": 1321, + "time_per_iteration": 3.464723587036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181344, + "balance_loss_mlp": 1.16827822, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.05258117628035473, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.76074928, + "num_input_tokens_seen": 
109816592, + "router_z_loss_mlp": 0.13085938, + "step": 1322, + "time_per_iteration": 4.845709562301636 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194039, + "balance_loss_mlp": 1.17626476, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.080849834949414, + "language_loss": 0.83025825, + "learning_rate": 0.0008735393822590908, + "loss": 0.84219867, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.17785645, + "step": 1323, + "time_per_iteration": 2.7540626525878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204948, + "balance_loss_mlp": 1.18740082, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.08178952973842966, + "language_loss": 0.86670357, + "learning_rate": 0.0008733322165207681, + "loss": 0.87875307, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.17578125, + "step": 1324, + "time_per_iteration": 2.6596570014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203339, + "balance_loss_mlp": 1.18555284, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.08051520692213045, + "language_loss": 0.82727516, + "learning_rate": 0.0008731249058420247, + "loss": 0.8393085, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.17810059, + "step": 1325, + "time_per_iteration": 3.082704782485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197065, + "balance_loss_mlp": 1.17887366, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.07988786822753648, + "language_loss": 0.90256196, + "learning_rate": 0.0008729174503033459, + "loss": 0.9145326, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.18188477, + "step": 1326, + "time_per_iteration": 2.663212299346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163002, + "balance_loss_mlp": 1.14462042, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 
0.09140325585124401, + "language_loss": 0.82217562, + "learning_rate": 0.0008727098499852728, + "loss": 0.83380556, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.18383789, + "step": 1327, + "time_per_iteration": 2.859302520751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114294, + "balance_loss_mlp": 1.12451005, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.07316654776483361, + "language_loss": 0.89623642, + "learning_rate": 0.0008725021049684034, + "loss": 0.90766573, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.18432617, + "step": 1328, + "time_per_iteration": 2.7523410320281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117051, + "balance_loss_mlp": 1.09832358, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.06969820691150284, + "language_loss": 0.82930326, + "learning_rate": 0.000872294215333391, + "loss": 0.84047389, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.18713379, + "step": 1329, + "time_per_iteration": 3.243213415145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.08953917, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.08533282388950945, + "language_loss": 0.82889348, + "learning_rate": 0.0008720861811609457, + "loss": 0.83997935, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.19042969, + "step": 1330, + "time_per_iteration": 2.789504051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086894, + "balance_loss_mlp": 1.06807089, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.08137535215054885, + "language_loss": 0.83645493, + "learning_rate": 0.0008718780025318338, + "loss": 0.84732389, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.18823242, + "step": 1331, + "time_per_iteration": 2.7668251991271973 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01092113, + "balance_loss_mlp": 1.07411242, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.08447566633159821, + "language_loss": 0.83860987, + "learning_rate": 0.0008716696795268771, + "loss": 0.84953099, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.18017578, + "step": 1332, + "time_per_iteration": 2.71281099319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088022, + "balance_loss_mlp": 1.06994987, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.08355917909814405, + "language_loss": 0.85442013, + "learning_rate": 0.0008714612122269538, + "loss": 0.8653003, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.1809082, + "step": 1333, + "time_per_iteration": 2.9077794551849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108256, + "balance_loss_mlp": 1.09015965, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.09490231540823739, + "language_loss": 0.89133245, + "learning_rate": 0.0008712526007129982, + "loss": 0.90241498, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.18103027, + "step": 1334, + "time_per_iteration": 2.5269079208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127264, + "balance_loss_mlp": 1.10958493, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.09530184614586146, + "language_loss": 0.90164447, + "learning_rate": 0.0008710438450660003, + "loss": 0.91291702, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.17687988, + "step": 1335, + "time_per_iteration": 2.690424680709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127744, + "balance_loss_mlp": 1.10994577, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.09938976745138839, + "language_loss": 0.87409496, + "learning_rate": 0.0008708349453670064, + "loss": 0.88537246, + "num_input_tokens_seen": 
110848560, + "router_z_loss_mlp": 0.17810059, + "step": 1336, + "time_per_iteration": 2.5319509506225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128077, + "balance_loss_mlp": 1.10982585, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.08461134195014028, + "language_loss": 0.91159999, + "learning_rate": 0.0008706259016971185, + "loss": 0.92288077, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.18249512, + "step": 1337, + "time_per_iteration": 2.8355276584625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133843, + "balance_loss_mlp": 1.11533022, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.1004001057114973, + "language_loss": 0.82634485, + "learning_rate": 0.0008704167141374944, + "loss": 0.83768326, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.18518066, + "step": 1338, + "time_per_iteration": 2.83562970161438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125326, + "balance_loss_mlp": 1.10650253, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.07535387519287148, + "language_loss": 0.87972409, + "learning_rate": 0.0008702073827693482, + "loss": 0.89097726, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.18823242, + "step": 1339, + "time_per_iteration": 2.7440268993377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121252, + "balance_loss_mlp": 1.10240531, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.07907705856450171, + "language_loss": 0.8856355, + "learning_rate": 0.0008699979076739494, + "loss": 0.89684802, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.18847656, + "step": 1340, + "time_per_iteration": 2.985356092453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132949, + "balance_loss_mlp": 1.11369705, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 
0.10358510275764175, + "language_loss": 0.88529009, + "learning_rate": 0.0008697882889326234, + "loss": 0.89661956, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.19238281, + "step": 1341, + "time_per_iteration": 2.564622163772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136453, + "balance_loss_mlp": 1.11695075, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.09783747399550236, + "language_loss": 0.8651613, + "learning_rate": 0.0008695785266267515, + "loss": 0.87652576, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.19482422, + "step": 1342, + "time_per_iteration": 2.7061781883239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147948, + "balance_loss_mlp": 1.12840939, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.08416519118542358, + "language_loss": 0.83111393, + "learning_rate": 0.0008693686208377704, + "loss": 0.84259331, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.19543457, + "step": 1343, + "time_per_iteration": 2.8751444816589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150711, + "balance_loss_mlp": 1.13156581, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.07899493252865974, + "language_loss": 0.88980556, + "learning_rate": 0.0008691585716471733, + "loss": 0.90131271, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.19140625, + "step": 1344, + "time_per_iteration": 2.6969785690307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159409, + "balance_loss_mlp": 1.14027607, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.06941419908962602, + "language_loss": 0.8544178, + "learning_rate": 0.0008689483791365079, + "loss": 0.86601192, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.19116211, + "step": 1345, + "time_per_iteration": 2.8562369346618652 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01154974, + "balance_loss_mlp": 1.13669968, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.07286553563097259, + "language_loss": 0.89186096, + "learning_rate": 0.0008687380433873786, + "loss": 0.90341073, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.18273926, + "step": 1346, + "time_per_iteration": 2.7854301929473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173599, + "balance_loss_mlp": 1.15573001, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.11357363401175323, + "language_loss": 0.82125735, + "learning_rate": 0.0008685275644814448, + "loss": 0.83299333, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.17883301, + "step": 1347, + "time_per_iteration": 2.6921608448028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116629, + "balance_loss_mlp": 1.14855206, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07639398633752482, + "language_loss": 0.8419714, + "learning_rate": 0.0008683169425004216, + "loss": 0.85363436, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.17773438, + "step": 1348, + "time_per_iteration": 2.9085500240325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153795, + "balance_loss_mlp": 1.13597322, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.09519621553180321, + "language_loss": 0.8328886, + "learning_rate": 0.0008681061775260799, + "loss": 0.84442651, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.17834473, + "step": 1349, + "time_per_iteration": 2.8755290508270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143034, + "balance_loss_mlp": 1.12578487, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.10298645875309809, + "language_loss": 0.92206728, + "learning_rate": 0.0008678952696402458, + "loss": 0.93349767, + 
"num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.17260742, + "step": 1350, + "time_per_iteration": 2.530040740966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128339, + "balance_loss_mlp": 1.11113763, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.07054972097096389, + "language_loss": 0.85973078, + "learning_rate": 0.000867684218924801, + "loss": 0.87101424, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.17211914, + "step": 1351, + "time_per_iteration": 2.924776077270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135752, + "balance_loss_mlp": 1.12478447, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.07057525744027235, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80082846, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.10986328, + "step": 1352, + "time_per_iteration": 4.937533378601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127686, + "balance_loss_mlp": 1.11084199, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.06384913215279323, + "language_loss": 0.85261834, + "learning_rate": 0.0008672616893328834, + "loss": 0.86389524, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.1685791, + "step": 1353, + "time_per_iteration": 2.9442062377929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122527, + "balance_loss_mlp": 1.10589719, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.09199225792086613, + "language_loss": 0.90041292, + "learning_rate": 0.0008670502106204512, + "loss": 0.91163814, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.16638184, + "step": 1354, + "time_per_iteration": 2.840792417526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132378, + "balance_loss_mlp": 1.11488962, + "epoch": 0.26067718353212777, + "flos": 
517033704960.0, + "grad_norm": 0.0749682309300763, + "language_loss": 0.81919277, + "learning_rate": 0.0008668385894064892, + "loss": 0.83051658, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.1751709, + "step": 1355, + "time_per_iteration": 2.649226665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150444, + "balance_loss_mlp": 1.13379025, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.10108237113866697, + "language_loss": 0.89089942, + "learning_rate": 0.0008666268257731562, + "loss": 0.90240383, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.16662598, + "step": 1356, + "time_per_iteration": 3.1606926918029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152316, + "balance_loss_mlp": 1.13520908, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.09285423546908722, + "language_loss": 0.85545158, + "learning_rate": 0.0008664149198026662, + "loss": 0.86697471, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.17126465, + "step": 1357, + "time_per_iteration": 3.286130428314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164462, + "balance_loss_mlp": 1.14699829, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.08517439685870379, + "language_loss": 0.88857412, + "learning_rate": 0.0008662028715772883, + "loss": 0.90021884, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.17480469, + "step": 1358, + "time_per_iteration": 2.6877803802490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157352, + "balance_loss_mlp": 1.13951862, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.08437519054308197, + "language_loss": 0.85356647, + "learning_rate": 0.0008659906811793467, + "loss": 0.86514002, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.1784668, + "step": 1359, + "time_per_iteration": 2.701963186264038 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152325, + "balance_loss_mlp": 1.13483691, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.09516463994255123, + "language_loss": 0.89262813, + "learning_rate": 0.0008657783486912215, + "loss": 0.90415138, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.17504883, + "step": 1360, + "time_per_iteration": 2.7410097122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150671, + "balance_loss_mlp": 1.1330992, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.06828467212359378, + "language_loss": 0.8976928, + "learning_rate": 0.0008655658741953472, + "loss": 0.90919948, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.17590332, + "step": 1361, + "time_per_iteration": 3.2329330444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138416, + "balance_loss_mlp": 1.12074876, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.06454511059104741, + "language_loss": 0.88249099, + "learning_rate": 0.0008653532577742136, + "loss": 0.89387512, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.17675781, + "step": 1362, + "time_per_iteration": 2.746363401412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139921, + "balance_loss_mlp": 1.12302947, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.07711827630070714, + "language_loss": 0.86794758, + "learning_rate": 0.0008651404995103659, + "loss": 0.87934673, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.16906738, + "step": 1363, + "time_per_iteration": 2.5565500259399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132814, + "balance_loss_mlp": 1.11538577, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.08155880386034024, + "language_loss": 0.8709327, + "learning_rate": 0.0008649275994864041, + "loss": 
0.8822608, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.17431641, + "step": 1364, + "time_per_iteration": 2.716562032699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133153, + "balance_loss_mlp": 1.11586761, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.06672959076804742, + "language_loss": 0.83875144, + "learning_rate": 0.0008647145577849834, + "loss": 0.85008299, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.1730957, + "step": 1365, + "time_per_iteration": 2.8476812839508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129924, + "balance_loss_mlp": 1.11255515, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.0668808093236692, + "language_loss": 0.82936931, + "learning_rate": 0.0008645013744888139, + "loss": 0.8406685, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.17382812, + "step": 1366, + "time_per_iteration": 2.891817092895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127692, + "balance_loss_mlp": 1.11063313, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.08778385712395331, + "language_loss": 0.87274009, + "learning_rate": 0.0008642880496806607, + "loss": 0.88401705, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.17077637, + "step": 1367, + "time_per_iteration": 2.8053958415985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120237, + "balance_loss_mlp": 1.10274851, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.07681621031760291, + "language_loss": 0.84336966, + "learning_rate": 0.0008640745834433437, + "loss": 0.85457206, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.17504883, + "step": 1368, + "time_per_iteration": 2.787339925765991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121438, + "balance_loss_mlp": 1.10430789, + "epoch": 0.2633705271258176, + 
"flos": 555543650304.0, + "grad_norm": 0.09521927305918056, + "language_loss": 0.86539549, + "learning_rate": 0.000863860975859738, + "loss": 0.87660992, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.17126465, + "step": 1369, + "time_per_iteration": 2.9646191596984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114699, + "balance_loss_mlp": 1.0977838, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.08138719928792186, + "language_loss": 0.87995172, + "learning_rate": 0.0008636472270127733, + "loss": 0.89109874, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.16918945, + "step": 1370, + "time_per_iteration": 2.646869421005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110661, + "balance_loss_mlp": 1.08878803, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.09119402348134849, + "language_loss": 0.90394557, + "learning_rate": 0.0008634333369854345, + "loss": 0.91501164, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.1784668, + "step": 1371, + "time_per_iteration": 2.630207061767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101355, + "balance_loss_mlp": 1.083915, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.08212786438033774, + "language_loss": 0.87634504, + "learning_rate": 0.0008632193058607608, + "loss": 0.88735861, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.17456055, + "step": 1372, + "time_per_iteration": 2.7757019996643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114382, + "balance_loss_mlp": 1.09665525, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.10317877520485044, + "language_loss": 0.80747414, + "learning_rate": 0.0008630051337218466, + "loss": 0.81861794, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.17736816, + "step": 1373, + "time_per_iteration": 2.7459805011749268 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117051, + "balance_loss_mlp": 1.09961104, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.08099527295858751, + "language_loss": 0.82020557, + "learning_rate": 0.0008627908206518409, + "loss": 0.83137608, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.17456055, + "step": 1374, + "time_per_iteration": 2.719428300857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113813, + "balance_loss_mlp": 1.12554145, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.042063102349752246, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76289386, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.12597656, + "step": 1375, + "time_per_iteration": 4.988332748413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112187, + "balance_loss_mlp": 1.09442437, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.06812086657274741, + "language_loss": 0.91138768, + "learning_rate": 0.0008623617720514241, + "loss": 0.92250949, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.1776123, + "step": 1376, + "time_per_iteration": 2.644531726837158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109794, + "balance_loss_mlp": 1.09182918, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.0722091181333716, + "language_loss": 0.84490621, + "learning_rate": 0.0008621470366875848, + "loss": 0.85600418, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.1796875, + "step": 1377, + "time_per_iteration": 2.605417490005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100816, + "balance_loss_mlp": 1.08375728, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.07263229866332392, + "language_loss": 0.87396085, + "learning_rate": 0.0008619321607257966, + "loss": 
0.884969, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.17077637, + "step": 1378, + "time_per_iteration": 2.7229108810424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100855, + "balance_loss_mlp": 1.08392727, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.07341413806820511, + "language_loss": 0.82002622, + "learning_rate": 0.000861717144249482, + "loss": 0.83103478, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.16943359, + "step": 1379, + "time_per_iteration": 2.9031612873077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105487, + "balance_loss_mlp": 1.08884549, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06987190342408907, + "language_loss": 0.89693463, + "learning_rate": 0.0008615019873421175, + "loss": 0.9079895, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.16650391, + "step": 1380, + "time_per_iteration": 2.5554280281066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105329, + "balance_loss_mlp": 1.08804345, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.07960659576711203, + "language_loss": 0.85129094, + "learning_rate": 0.0008612866900872349, + "loss": 0.86234426, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.17297363, + "step": 1381, + "time_per_iteration": 2.560756206512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115387, + "balance_loss_mlp": 1.0986619, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.10185032090542295, + "language_loss": 0.87836969, + "learning_rate": 0.0008610712525684197, + "loss": 0.88952351, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.1673584, + "step": 1382, + "time_per_iteration": 2.649127721786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111173, + "balance_loss_mlp": 1.09392381, + "epoch": 0.2660638707195075, + 
"flos": 1017464094720.0, + "grad_norm": 0.09094270381931494, + "language_loss": 0.84048492, + "learning_rate": 0.0008608556748693121, + "loss": 0.85159665, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.17260742, + "step": 1383, + "time_per_iteration": 3.2573940753936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109064, + "balance_loss_mlp": 1.09163558, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.0818167871774861, + "language_loss": 0.859007, + "learning_rate": 0.000860639957073607, + "loss": 0.87009764, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.17443848, + "step": 1384, + "time_per_iteration": 2.7120518684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110901, + "balance_loss_mlp": 1.0937109, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.07681443511092155, + "language_loss": 0.87386912, + "learning_rate": 0.0008604240992650534, + "loss": 0.88497818, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.17211914, + "step": 1385, + "time_per_iteration": 2.69921612739563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113027, + "balance_loss_mlp": 1.09546757, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.06494344058238215, + "language_loss": 0.88934892, + "learning_rate": 0.0008602081015274545, + "loss": 0.9004792, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.17553711, + "step": 1386, + "time_per_iteration": 2.7353157997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117717, + "balance_loss_mlp": 1.10092068, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.06900257884101904, + "language_loss": 0.83328801, + "learning_rate": 0.0008599919639446684, + "loss": 0.8444652, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.16809082, + "step": 1387, + "time_per_iteration": 2.6927597522735596 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110335, + "balance_loss_mlp": 1.09289455, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.08338734757979376, + "language_loss": 0.79947424, + "learning_rate": 0.000859775686600607, + "loss": 0.81057751, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.17468262, + "step": 1388, + "time_per_iteration": 2.5740597248077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123987, + "balance_loss_mlp": 1.10719037, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.09984082638450108, + "language_loss": 0.84917498, + "learning_rate": 0.0008595592695792367, + "loss": 0.86041486, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.16809082, + "step": 1389, + "time_per_iteration": 2.6907854080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112253, + "balance_loss_mlp": 1.10618591, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.06989103866242331, + "language_loss": 0.90147883, + "learning_rate": 0.0008593427129645778, + "loss": 0.91270411, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.16345215, + "step": 1390, + "time_per_iteration": 2.6145434379577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120602, + "balance_loss_mlp": 1.10381722, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.07905482313842922, + "language_loss": 0.85086334, + "learning_rate": 0.0008591260168407052, + "loss": 0.86206937, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.16796875, + "step": 1391, + "time_per_iteration": 2.787076711654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117002, + "balance_loss_mlp": 1.10062313, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.0789554563697551, + "language_loss": 0.8226018, + "learning_rate": 0.0008589091812917479, + "loss": 
0.83377182, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.16381836, + "step": 1392, + "time_per_iteration": 2.6753129959106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122652, + "balance_loss_mlp": 1.10604584, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07614476371572584, + "language_loss": 0.84920317, + "learning_rate": 0.0008586922064018887, + "loss": 0.86042964, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.1661377, + "step": 1393, + "time_per_iteration": 2.716813325881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114509, + "balance_loss_mlp": 1.09750938, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.08000570031295028, + "language_loss": 0.89098954, + "learning_rate": 0.0008584750922553651, + "loss": 0.90213466, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.17016602, + "step": 1394, + "time_per_iteration": 3.1575980186462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121389, + "balance_loss_mlp": 1.10477114, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.0683134764251081, + "language_loss": 0.83357704, + "learning_rate": 0.0008582578389364677, + "loss": 0.84479094, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.16625977, + "step": 1395, + "time_per_iteration": 2.885806083679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127129, + "balance_loss_mlp": 1.10989153, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.08737379963197432, + "language_loss": 0.91578317, + "learning_rate": 0.0008580404465295422, + "loss": 0.92705452, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.17260742, + "step": 1396, + "time_per_iteration": 2.849519968032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135341, + "balance_loss_mlp": 1.1180197, + "epoch": 0.2687572143131974, + 
"flos": 714271882752.0, + "grad_norm": 0.08461023567525901, + "language_loss": 0.8857668, + "learning_rate": 0.0008578229151189876, + "loss": 0.89712024, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.17321777, + "step": 1397, + "time_per_iteration": 2.94858980178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127453, + "balance_loss_mlp": 1.10984576, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.12493178829468786, + "language_loss": 0.81211323, + "learning_rate": 0.0008576052447892573, + "loss": 0.82338774, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.17614746, + "step": 1398, + "time_per_iteration": 2.534120798110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135254, + "balance_loss_mlp": 1.1178261, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.06803844431236612, + "language_loss": 0.85910499, + "learning_rate": 0.000857387435624858, + "loss": 0.87045753, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.17456055, + "step": 1399, + "time_per_iteration": 2.554008960723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159537, + "balance_loss_mlp": 1.1418941, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.0815296826798993, + "language_loss": 0.87922233, + "learning_rate": 0.0008571694877103513, + "loss": 0.8908177, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.17663574, + "step": 1400, + "time_per_iteration": 3.2941367626190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173169, + "balance_loss_mlp": 1.15442979, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.09384983289618287, + "language_loss": 0.8761692, + "learning_rate": 0.0008569514011303515, + "loss": 0.88790089, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.1875, + "step": 1401, + "time_per_iteration": 2.814588785171509 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157764, + "balance_loss_mlp": 1.1397872, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.09439986590001768, + "language_loss": 0.87801731, + "learning_rate": 0.0008567331759695277, + "loss": 0.88959491, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.17980957, + "step": 1402, + "time_per_iteration": 2.765251398086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144715, + "balance_loss_mlp": 1.12577283, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.08321050634823257, + "language_loss": 0.85899508, + "learning_rate": 0.0008565148123126023, + "loss": 0.87044227, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.18933105, + "step": 1403, + "time_per_iteration": 2.7100989818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125241, + "balance_loss_mlp": 1.10733557, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.0728098596241797, + "language_loss": 0.86166966, + "learning_rate": 0.0008562963102443516, + "loss": 0.87292206, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.17907715, + "step": 1404, + "time_per_iteration": 2.7286291122436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112919, + "balance_loss_mlp": 1.09493017, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.10158619193030523, + "language_loss": 0.84717911, + "learning_rate": 0.0008560776698496056, + "loss": 0.85830832, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.17993164, + "step": 1405, + "time_per_iteration": 2.9067912101745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103079, + "balance_loss_mlp": 1.08472061, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.08020634125989436, + "language_loss": 0.85596079, + "learning_rate": 0.0008558588912132481, + "loss": 0.86699152, + 
"num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.18359375, + "step": 1406, + "time_per_iteration": 2.880148410797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071193, + "balance_loss_mlp": 1.05955815, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.03626473669965315, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77530181, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.11621094, + "step": 1407, + "time_per_iteration": 4.905766487121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087497, + "balance_loss_mlp": 1.06903148, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.0815781254437323, + "language_loss": 0.82643741, + "learning_rate": 0.0008554209195555016, + "loss": 0.83731234, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.18481445, + "step": 1408, + "time_per_iteration": 2.759427309036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086012, + "balance_loss_mlp": 1.06754613, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.08207637293966, + "language_loss": 0.87980115, + "learning_rate": 0.0008552017267041483, + "loss": 0.89066136, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.18457031, + "step": 1409, + "time_per_iteration": 2.71040678024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088281, + "balance_loss_mlp": 1.06865954, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.0734300404961751, + "language_loss": 0.83141303, + "learning_rate": 0.0008549823959512549, + "loss": 0.84229583, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.19616699, + "step": 1410, + "time_per_iteration": 2.6883578300476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104836, + "balance_loss_mlp": 1.08663297, + "epoch": 0.27145055790688727, + "flos": 
997442823168.0, + "grad_norm": 0.07342840956593329, + "language_loss": 0.86307788, + "learning_rate": 0.0008547629273819728, + "loss": 0.87412632, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.18212891, + "step": 1411, + "time_per_iteration": 3.4179537296295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110447, + "balance_loss_mlp": 1.09208882, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.07902892919535931, + "language_loss": 0.83264589, + "learning_rate": 0.0008545433210815074, + "loss": 0.84375036, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.18347168, + "step": 1412, + "time_per_iteration": 2.644336462020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132524, + "balance_loss_mlp": 1.11396301, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.08239543530107682, + "language_loss": 0.87688351, + "learning_rate": 0.0008543235771351176, + "loss": 0.88820869, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.18554688, + "step": 1413, + "time_per_iteration": 2.7242777347564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140498, + "balance_loss_mlp": 1.12286687, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.06292390757949942, + "language_loss": 0.84580851, + "learning_rate": 0.0008541036956281154, + "loss": 0.85721344, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.17651367, + "step": 1414, + "time_per_iteration": 2.917314052581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149081, + "balance_loss_mlp": 1.13212919, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.09608953935856007, + "language_loss": 0.81591362, + "learning_rate": 0.0008538836766458665, + "loss": 0.82740438, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.16967773, + "step": 1415, + "time_per_iteration": 2.8857710361480713 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115948, + "balance_loss_mlp": 1.14234948, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.09141970967130493, + "language_loss": 0.84791577, + "learning_rate": 0.0008536635202737897, + "loss": 0.85951054, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.17150879, + "step": 1416, + "time_per_iteration": 2.8404181003570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168227, + "balance_loss_mlp": 1.15094137, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.08934509912200893, + "language_loss": 0.81624401, + "learning_rate": 0.0008534432265973573, + "loss": 0.82792622, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.1730957, + "step": 1417, + "time_per_iteration": 2.636125326156616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117943, + "balance_loss_mlp": 1.16220391, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.09636198633360953, + "language_loss": 0.87909538, + "learning_rate": 0.000853222795702095, + "loss": 0.89088964, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.17248535, + "step": 1418, + "time_per_iteration": 3.452954053878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168866, + "balance_loss_mlp": 1.15174711, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.09586408952292569, + "language_loss": 0.83810413, + "learning_rate": 0.0008530022276735813, + "loss": 0.84979284, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.17138672, + "step": 1419, + "time_per_iteration": 2.74656081199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160265, + "balance_loss_mlp": 1.14302731, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.07361815357739941, + "language_loss": 0.8564744, + "learning_rate": 0.0008527815225974489, + "loss": 0.86807704, 
+ "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.17260742, + "step": 1420, + "time_per_iteration": 2.6620352268218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161372, + "balance_loss_mlp": 1.14375329, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10060729288286506, + "language_loss": 0.88312179, + "learning_rate": 0.0008525606805593829, + "loss": 0.89473552, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.17651367, + "step": 1421, + "time_per_iteration": 2.4528608322143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152179, + "balance_loss_mlp": 1.13429809, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.0906337737142573, + "language_loss": 0.82765526, + "learning_rate": 0.0008523397016451213, + "loss": 0.83917701, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.17895508, + "step": 1422, + "time_per_iteration": 2.611370086669922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146443, + "balance_loss_mlp": 1.12862146, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.0675988615568281, + "language_loss": 0.86714458, + "learning_rate": 0.0008521185859404564, + "loss": 0.87860906, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.17822266, + "step": 1423, + "time_per_iteration": 3.4147353172302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127875, + "balance_loss_mlp": 1.11027932, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.10391013903512737, + "language_loss": 0.89233863, + "learning_rate": 0.0008518973335312326, + "loss": 0.90361738, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.17602539, + "step": 1424, + "time_per_iteration": 2.8380019664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131922, + "balance_loss_mlp": 1.11418414, + "epoch": 0.27414390150057716, + "flos": 
550372506624.0, + "grad_norm": 0.08776572848910039, + "language_loss": 0.83471692, + "learning_rate": 0.0008516759445033477, + "loss": 0.8460362, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.1776123, + "step": 1425, + "time_per_iteration": 2.6492245197296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148521, + "balance_loss_mlp": 1.13083041, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.09331893476455168, + "language_loss": 0.84960282, + "learning_rate": 0.0008514544189427526, + "loss": 0.86108804, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.17687988, + "step": 1426, + "time_per_iteration": 2.694824457168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160501, + "balance_loss_mlp": 1.14289403, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.10058930784889258, + "language_loss": 0.86324757, + "learning_rate": 0.0008512327569354511, + "loss": 0.8748526, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.17602539, + "step": 1427, + "time_per_iteration": 2.5711381435394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170402, + "balance_loss_mlp": 1.15265131, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.08313733600620697, + "language_loss": 0.83505958, + "learning_rate": 0.0008510109585675001, + "loss": 0.84676361, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.17749023, + "step": 1428, + "time_per_iteration": 2.6291069984436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075627, + "balance_loss_mlp": 1.06465936, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.04529042076604016, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82228971, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.10986328, + "step": 1429, + "time_per_iteration": 4.732970952987671 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151608, + "balance_loss_mlp": 1.13460922, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.10649873504882197, + "language_loss": 0.80186272, + "learning_rate": 0.0008505669530941415, + "loss": 0.81337881, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.17016602, + "step": 1430, + "time_per_iteration": 3.3425114154815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132102, + "balance_loss_mlp": 1.11454248, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.09668389067503143, + "language_loss": 0.83789647, + "learning_rate": 0.000850344746161112, + "loss": 0.84921753, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.17578125, + "step": 1431, + "time_per_iteration": 2.6212620735168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115216, + "balance_loss_mlp": 1.09790659, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.07650346740070771, + "language_loss": 0.87718683, + "learning_rate": 0.0008501224032121894, + "loss": 0.88833898, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.17321777, + "step": 1432, + "time_per_iteration": 2.531632900238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099408, + "balance_loss_mlp": 1.0818007, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.07599019403635421, + "language_loss": 0.81644619, + "learning_rate": 0.0008498999243336946, + "loss": 0.82744026, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.17626953, + "step": 1433, + "time_per_iteration": 2.6577858924865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108116, + "balance_loss_mlp": 1.09086609, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.08691171830183525, + "language_loss": 0.87290454, + "learning_rate": 0.0008496773096120021, + "loss": 0.8839857, 
+ "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.17260742, + "step": 1434, + "time_per_iteration": 2.8218367099761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103536, + "balance_loss_mlp": 1.08573806, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.09853984157164923, + "language_loss": 0.83996856, + "learning_rate": 0.0008494545591335381, + "loss": 0.85100389, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.17810059, + "step": 1435, + "time_per_iteration": 2.9297800064086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114552, + "balance_loss_mlp": 1.09671807, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.06137328591569865, + "language_loss": 0.86751276, + "learning_rate": 0.0008492316729847823, + "loss": 0.87865829, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.1784668, + "step": 1436, + "time_per_iteration": 2.8056235313415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111542, + "balance_loss_mlp": 1.09787273, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08045565015071575, + "language_loss": 0.79808342, + "learning_rate": 0.0008490086512522664, + "loss": 0.8092376, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.17565918, + "step": 1437, + "time_per_iteration": 2.7486345767974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125236, + "balance_loss_mlp": 1.10653245, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.07152243392964944, + "language_loss": 0.90246308, + "learning_rate": 0.0008487854940225755, + "loss": 0.91371536, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.18701172, + "step": 1438, + "time_per_iteration": 2.45500111579895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119071, + "balance_loss_mlp": 1.10104609, + "epoch": 0.27683724509426705, + "flos": 
522138410496.0, + "grad_norm": 0.12336147646099646, + "language_loss": 0.89520633, + "learning_rate": 0.0008485622013823466, + "loss": 0.9063971, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.18029785, + "step": 1439, + "time_per_iteration": 2.6394927501678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116899, + "balance_loss_mlp": 1.09899366, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.08970889576331396, + "language_loss": 0.83229852, + "learning_rate": 0.00084833877341827, + "loss": 0.84346747, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.17895508, + "step": 1440, + "time_per_iteration": 2.673386812210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137485, + "balance_loss_mlp": 1.11953235, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.09818503582677594, + "language_loss": 0.8055383, + "learning_rate": 0.000848115210217088, + "loss": 0.81691313, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.17956543, + "step": 1441, + "time_per_iteration": 2.6129040718078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143053, + "balance_loss_mlp": 1.12554169, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.08082573862086316, + "language_loss": 0.81372535, + "learning_rate": 0.0008478915118655952, + "loss": 0.82515597, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.17529297, + "step": 1442, + "time_per_iteration": 2.843041181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150917, + "balance_loss_mlp": 1.13371468, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.07560665817061937, + "language_loss": 0.86043841, + "learning_rate": 0.0008476676784506393, + "loss": 0.87194753, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.17224121, + "step": 1443, + "time_per_iteration": 2.669281005859375 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145766, + "balance_loss_mlp": 1.12862349, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.07357545068984293, + "language_loss": 0.81809199, + "learning_rate": 0.0008474437100591201, + "loss": 0.82954967, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.17150879, + "step": 1444, + "time_per_iteration": 3.32959246635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112957, + "balance_loss_mlp": 1.1127255, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.08256066258120752, + "language_loss": 0.85183853, + "learning_rate": 0.0008472196067779898, + "loss": 0.86313421, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.1685791, + "step": 1445, + "time_per_iteration": 2.6932947635650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011128, + "balance_loss_mlp": 1.09586096, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.1350534130118882, + "language_loss": 0.85003686, + "learning_rate": 0.0008469953686942531, + "loss": 0.86116487, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.16955566, + "step": 1446, + "time_per_iteration": 3.0903265476226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122325, + "balance_loss_mlp": 1.10539699, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.09027465145753444, + "language_loss": 0.82766867, + "learning_rate": 0.0008467709958949668, + "loss": 0.83889192, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.16943359, + "step": 1447, + "time_per_iteration": 2.7486042976379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122592, + "balance_loss_mlp": 1.1059382, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.08057262764159107, + "language_loss": 0.85942835, + "learning_rate": 0.0008465464884672403, + "loss": 0.87065423, + 
"num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.16662598, + "step": 1448, + "time_per_iteration": 2.7239129543304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128319, + "balance_loss_mlp": 1.11145079, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.0722544104008292, + "language_loss": 0.85391676, + "learning_rate": 0.0008463218464982348, + "loss": 0.86520004, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.16882324, + "step": 1449, + "time_per_iteration": 2.824716329574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112804, + "balance_loss_mlp": 1.11102891, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.07814645269371487, + "language_loss": 0.8771199, + "learning_rate": 0.0008460970700751645, + "loss": 0.88840032, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.17016602, + "step": 1450, + "time_per_iteration": 3.1141586303710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126615, + "balance_loss_mlp": 1.10931802, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.07255444133213705, + "language_loss": 0.87776339, + "learning_rate": 0.000845872159285295, + "loss": 0.8890295, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.1730957, + "step": 1451, + "time_per_iteration": 2.739476442337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085209, + "balance_loss_mlp": 1.07529104, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.033234239085754465, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78852057, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.09912109, + "step": 1452, + "time_per_iteration": 4.910952806472778 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138861, + "balance_loss_mlp": 1.12121844, + "epoch": 0.2795305886879569, + "flos": 
1031859025920.0, + "grad_norm": 0.10385775803237589, + "language_loss": 0.86136031, + "learning_rate": 0.0008454219349544836, + "loss": 0.87274891, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.17651367, + "step": 1453, + "time_per_iteration": 3.3671629428863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121876, + "balance_loss_mlp": 1.10430491, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.07125574209855656, + "language_loss": 0.82064086, + "learning_rate": 0.000845196621588334, + "loss": 0.83185959, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.17602539, + "step": 1454, + "time_per_iteration": 2.775218963623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125012, + "balance_loss_mlp": 1.107584, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.07195305251583452, + "language_loss": 0.7580061, + "learning_rate": 0.0008449711742049706, + "loss": 0.76925623, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.17443848, + "step": 1455, + "time_per_iteration": 2.785322427749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120523, + "balance_loss_mlp": 1.10295129, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.08382647519260926, + "language_loss": 0.83480191, + "learning_rate": 0.0008447455928919196, + "loss": 0.84600711, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.17590332, + "step": 1456, + "time_per_iteration": 2.660736083984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119281, + "balance_loss_mlp": 1.10179305, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.0678890613230097, + "language_loss": 0.86596936, + "learning_rate": 0.0008445198777367595, + "loss": 0.87716216, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.17492676, + "step": 1457, + "time_per_iteration": 2.5753204822540283 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121467, + "balance_loss_mlp": 1.10389531, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.10986912551565038, + "language_loss": 0.80972993, + "learning_rate": 0.0008442940288271208, + "loss": 0.82094461, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.17578125, + "step": 1458, + "time_per_iteration": 2.641165018081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112066, + "balance_loss_mlp": 1.10273051, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06853525506838967, + "language_loss": 0.86948931, + "learning_rate": 0.0008440680462506856, + "loss": 0.88069594, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.17932129, + "step": 1459, + "time_per_iteration": 2.7613425254821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115398, + "balance_loss_mlp": 1.09818411, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.07519938139917645, + "language_loss": 0.86463004, + "learning_rate": 0.0008438419300951883, + "loss": 0.87578404, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.17224121, + "step": 1460, + "time_per_iteration": 2.684657335281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116837, + "balance_loss_mlp": 1.09928942, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.0687143759737579, + "language_loss": 0.86178434, + "learning_rate": 0.0008436156804484148, + "loss": 0.8729527, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.17565918, + "step": 1461, + "time_per_iteration": 2.860818386077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111314, + "balance_loss_mlp": 1.09343266, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.11710518654826144, + "language_loss": 0.88180649, + "learning_rate": 0.0008433892973982031, + "loss": 0.89291972, 
+ "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.17883301, + "step": 1462, + "time_per_iteration": 2.58311128616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106143, + "balance_loss_mlp": 1.08844042, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07819154550189573, + "language_loss": 0.84951186, + "learning_rate": 0.0008431627810324431, + "loss": 0.86057329, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.17724609, + "step": 1463, + "time_per_iteration": 2.6800074577331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111406, + "balance_loss_mlp": 1.09443069, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.06467590099086191, + "language_loss": 0.81057346, + "learning_rate": 0.000842936131439076, + "loss": 0.82168752, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.16992188, + "step": 1464, + "time_per_iteration": 2.6747214794158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111707, + "balance_loss_mlp": 1.09463668, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.06943840277913271, + "language_loss": 0.87714398, + "learning_rate": 0.0008427093487060951, + "loss": 0.88826108, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.17089844, + "step": 1465, + "time_per_iteration": 2.6723203659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113512, + "balance_loss_mlp": 1.09656, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.06709163317621891, + "language_loss": 0.846192, + "learning_rate": 0.000842482432921545, + "loss": 0.8573271, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.16955566, + "step": 1466, + "time_per_iteration": 2.8659911155700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104971, + "balance_loss_mlp": 1.0876503, + "epoch": 0.28222393228164677, + "flos": 
416980224000.0, + "grad_norm": 0.07868097185173097, + "language_loss": 0.86230814, + "learning_rate": 0.0008422553841735225, + "loss": 0.87335783, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.17333984, + "step": 1467, + "time_per_iteration": 2.5069150924682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109046, + "balance_loss_mlp": 1.09167767, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.07514750891429747, + "language_loss": 0.84737515, + "learning_rate": 0.0008420282025501757, + "loss": 0.85846567, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.17370605, + "step": 1468, + "time_per_iteration": 2.808751344680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094588, + "balance_loss_mlp": 1.07768393, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.0683968152950732, + "language_loss": 0.84884882, + "learning_rate": 0.0008418008881397043, + "loss": 0.85979474, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.16918945, + "step": 1469, + "time_per_iteration": 2.6929962635040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089051, + "balance_loss_mlp": 1.07267165, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.0720569823253329, + "language_loss": 0.82694614, + "learning_rate": 0.0008415734410303595, + "loss": 0.83783662, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.16381836, + "step": 1470, + "time_per_iteration": 3.2501566410064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095144, + "balance_loss_mlp": 1.07776332, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.07334017240809462, + "language_loss": 0.90763617, + "learning_rate": 0.0008413458613104444, + "loss": 0.91858757, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.17407227, + "step": 1471, + "time_per_iteration": 2.7336316108703613 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089358, + "balance_loss_mlp": 1.07198906, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.06835636483746928, + "language_loss": 0.82895148, + "learning_rate": 0.0008411181490683129, + "loss": 0.839845, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.17370605, + "step": 1472, + "time_per_iteration": 2.742314100265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085557, + "balance_loss_mlp": 1.0680809, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.08020623974692119, + "language_loss": 0.82316583, + "learning_rate": 0.0008408903043923707, + "loss": 0.83402139, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.17492676, + "step": 1473, + "time_per_iteration": 3.0307655334472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090161, + "balance_loss_mlp": 1.07230377, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09874308222598177, + "language_loss": 0.81175971, + "learning_rate": 0.0008406623273710754, + "loss": 0.8226614, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.17858887, + "step": 1474, + "time_per_iteration": 2.6652164459228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086824, + "balance_loss_mlp": 1.06919324, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.0806852987114514, + "language_loss": 0.82865691, + "learning_rate": 0.0008404342180929351, + "loss": 0.83952522, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.1763916, + "step": 1475, + "time_per_iteration": 2.676020622253418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.06739831, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.0807623151432505, + "language_loss": 0.81497931, + "learning_rate": 0.00084020597664651, + "loss": 0.82583237, + 
"num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.17907715, + "step": 1476, + "time_per_iteration": 2.8055877685546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087711, + "balance_loss_mlp": 1.06957936, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.09698913749719028, + "language_loss": 0.83786356, + "learning_rate": 0.0008399776031204111, + "loss": 0.8487407, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.18139648, + "step": 1477, + "time_per_iteration": 2.7545149326324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087702, + "balance_loss_mlp": 1.06898642, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.09010893322506078, + "language_loss": 0.7971096, + "learning_rate": 0.0008397490976033009, + "loss": 0.80798662, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.18713379, + "step": 1478, + "time_per_iteration": 2.654254198074341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107225, + "balance_loss_mlp": 1.06009066, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.04001675887347635, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78951895, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.12158203, + "step": 1479, + "time_per_iteration": 4.77993631362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088996, + "balance_loss_mlp": 1.07022035, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.07008895147668387, + "language_loss": 0.84977293, + "learning_rate": 0.0008392916909509525, + "loss": 0.86066294, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.18762207, + "step": 1480, + "time_per_iteration": 3.103787422180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110182, + "balance_loss_mlp": 1.08308077, + "epoch": 0.28491727587533666, + "flos": 
490158563328.0, + "grad_norm": 0.07686502510285433, + "language_loss": 0.8518846, + "learning_rate": 0.0008390627899932954, + "loss": 0.86290276, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.18737793, + "step": 1481, + "time_per_iteration": 2.6177799701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113908, + "balance_loss_mlp": 1.09524012, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.10214098417508043, + "language_loss": 0.88852942, + "learning_rate": 0.000838833757399789, + "loss": 0.89966846, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.18664551, + "step": 1482, + "time_per_iteration": 2.9566540718078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114933, + "balance_loss_mlp": 1.09678972, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.08257095939450843, + "language_loss": 0.80571115, + "learning_rate": 0.0008386045932593515, + "loss": 0.81686044, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.18139648, + "step": 1483, + "time_per_iteration": 2.717756509780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109957, + "balance_loss_mlp": 1.09277904, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.07262082200825942, + "language_loss": 0.86045611, + "learning_rate": 0.0008383752976609525, + "loss": 0.87155575, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.171875, + "step": 1484, + "time_per_iteration": 2.950330972671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113011, + "balance_loss_mlp": 1.09571338, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06349274760065945, + "language_loss": 0.7998122, + "learning_rate": 0.0008381458706936123, + "loss": 0.81094229, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.17321777, + "step": 1485, + "time_per_iteration": 2.750422239303589 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105744, + "balance_loss_mlp": 1.08867359, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.08725606785490185, + "language_loss": 0.87347835, + "learning_rate": 0.0008379163124464025, + "loss": 0.88453579, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.17089844, + "step": 1486, + "time_per_iteration": 2.8127403259277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108247, + "balance_loss_mlp": 1.09145021, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.08194161324991753, + "language_loss": 0.7704097, + "learning_rate": 0.0008376866230084452, + "loss": 0.78149223, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.16809082, + "step": 1487, + "time_per_iteration": 2.8382246494293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102304, + "balance_loss_mlp": 1.08535266, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.07305349361660647, + "language_loss": 0.85623455, + "learning_rate": 0.000837456802468914, + "loss": 0.8672576, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.16967773, + "step": 1488, + "time_per_iteration": 2.619359016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101386, + "balance_loss_mlp": 1.08414829, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.08101706440693511, + "language_loss": 0.85233498, + "learning_rate": 0.0008372268509170331, + "loss": 0.86334878, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.17248535, + "step": 1489, + "time_per_iteration": 2.735579252243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104312, + "balance_loss_mlp": 1.08728886, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.09066736504037358, + "language_loss": 0.84989464, + "learning_rate": 0.0008369967684420779, + "loss": 0.86093777, + 
"num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.17041016, + "step": 1490, + "time_per_iteration": 2.7550840377807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099327, + "balance_loss_mlp": 1.08251846, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.11208283725325253, + "language_loss": 0.84236765, + "learning_rate": 0.0008367665551333736, + "loss": 0.85336089, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.16821289, + "step": 1491, + "time_per_iteration": 2.6229591369628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118339, + "balance_loss_mlp": 1.10114861, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.08256436767566132, + "language_loss": 0.85062146, + "learning_rate": 0.0008365362110802977, + "loss": 0.86180484, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.17211914, + "step": 1492, + "time_per_iteration": 2.871260166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139482, + "balance_loss_mlp": 1.12254202, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.14712707580735673, + "language_loss": 0.82232606, + "learning_rate": 0.0008363057363722773, + "loss": 0.83372086, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.16955566, + "step": 1493, + "time_per_iteration": 2.8748109340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156529, + "balance_loss_mlp": 1.14010167, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.10196458183452421, + "language_loss": 0.84016562, + "learning_rate": 0.0008360751310987906, + "loss": 0.85173088, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.16430664, + "step": 1494, + "time_per_iteration": 2.6634154319763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156202, + "balance_loss_mlp": 1.13989449, + "epoch": 0.28761061946902655, + "flos": 
603752030208.0, + "grad_norm": 0.07806891614800103, + "language_loss": 0.85166085, + "learning_rate": 0.0008358443953493666, + "loss": 0.8632229, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.16308594, + "step": 1495, + "time_per_iteration": 2.875852584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161766, + "balance_loss_mlp": 1.1449573, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.11619662908019952, + "language_loss": 0.88208884, + "learning_rate": 0.0008356135292135851, + "loss": 0.89370644, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.16821289, + "step": 1496, + "time_per_iteration": 2.5129776000976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129263, + "balance_loss_mlp": 1.11256182, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.0960393188024377, + "language_loss": 0.91794455, + "learning_rate": 0.0008353825327810758, + "loss": 0.92923725, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.16711426, + "step": 1497, + "time_per_iteration": 2.437619686126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109969, + "balance_loss_mlp": 1.09312487, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.09345990074491838, + "language_loss": 0.81679749, + "learning_rate": 0.00083515140614152, + "loss": 0.82789719, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.1685791, + "step": 1498, + "time_per_iteration": 2.7478325366973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119738, + "balance_loss_mlp": 1.10310864, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.10003726096036522, + "language_loss": 0.868577, + "learning_rate": 0.0008349201493846485, + "loss": 0.87977445, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.16625977, + "step": 1499, + "time_per_iteration": 2.639324188232422 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116843, + "balance_loss_mlp": 1.09971237, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.07951211502216154, + "language_loss": 0.89032578, + "learning_rate": 0.0008346887626002432, + "loss": 0.90149426, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.17150879, + "step": 1500, + "time_per_iteration": 2.542311668395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120306, + "balance_loss_mlp": 1.10360527, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.0665017309713035, + "language_loss": 0.85912937, + "learning_rate": 0.000834457245878137, + "loss": 0.87033248, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.16711426, + "step": 1501, + "time_per_iteration": 2.639570951461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122516, + "balance_loss_mlp": 1.10619664, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.07589763823888349, + "language_loss": 0.80857193, + "learning_rate": 0.000834225599308212, + "loss": 0.81979704, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.16320801, + "step": 1502, + "time_per_iteration": 3.2867560386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113369, + "balance_loss_mlp": 1.11684537, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.09000351929941647, + "language_loss": 0.84986663, + "learning_rate": 0.0008339938229804016, + "loss": 0.86120355, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.1685791, + "step": 1503, + "time_per_iteration": 2.7262394428253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167456, + "balance_loss_mlp": 1.15496254, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.04837114619258858, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.7660228, + 
"num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.12451172, + "step": 1504, + "time_per_iteration": 4.9622483253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129895, + "balance_loss_mlp": 1.11289549, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.1124140207378676, + "language_loss": 0.83872616, + "learning_rate": 0.0008335298814111094, + "loss": 0.85002512, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.17016602, + "step": 1505, + "time_per_iteration": 2.6357829570770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133506, + "balance_loss_mlp": 1.11616087, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.09211411957598506, + "language_loss": 0.87906271, + "learning_rate": 0.0008332977163497455, + "loss": 0.89039779, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.17370605, + "step": 1506, + "time_per_iteration": 2.798208475112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123584, + "balance_loss_mlp": 1.10653734, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.07286788522172229, + "language_loss": 0.83603442, + "learning_rate": 0.0008330654218907325, + "loss": 0.84727025, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.1706543, + "step": 1507, + "time_per_iteration": 2.708980083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112441, + "balance_loss_mlp": 1.09509647, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.06462764814837715, + "language_loss": 0.8140111, + "learning_rate": 0.0008328329981242548, + "loss": 0.82513553, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.17358398, + "step": 1508, + "time_per_iteration": 2.894169330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110509, + "balance_loss_mlp": 1.08767331, + "epoch": 0.29030396306271644, + "flos": 
536226822144.0, + "grad_norm": 0.08188322832397743, + "language_loss": 0.87448251, + "learning_rate": 0.0008326004451405475, + "loss": 0.88553333, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.17443848, + "step": 1509, + "time_per_iteration": 2.8026657104492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092866, + "balance_loss_mlp": 1.07596231, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.07862145855051805, + "language_loss": 0.81981707, + "learning_rate": 0.0008323677630298957, + "loss": 0.8307457, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.16918945, + "step": 1510, + "time_per_iteration": 2.6314613819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109107, + "balance_loss_mlp": 1.07407045, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.06795291351042136, + "language_loss": 0.84809089, + "learning_rate": 0.0008321349518826345, + "loss": 0.85900158, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.17016602, + "step": 1511, + "time_per_iteration": 2.8404459953308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086604, + "balance_loss_mlp": 1.06950927, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.11455853074779208, + "language_loss": 0.95139891, + "learning_rate": 0.0008319020117891491, + "loss": 0.96226501, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.17102051, + "step": 1512, + "time_per_iteration": 2.6767001152038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084061, + "balance_loss_mlp": 1.06650186, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.0847466939070868, + "language_loss": 0.86754417, + "learning_rate": 0.0008316689428398751, + "loss": 0.87838477, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.17565918, + "step": 1513, + "time_per_iteration": 2.7069385051727295 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079727, + "balance_loss_mlp": 1.06318033, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.1225209310639027, + "language_loss": 0.88519126, + "learning_rate": 0.0008314357451252979, + "loss": 0.89598852, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.16552734, + "step": 1514, + "time_per_iteration": 2.8014771938323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088545, + "balance_loss_mlp": 1.07215357, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.09390151153588368, + "language_loss": 0.87912899, + "learning_rate": 0.0008312024187359527, + "loss": 0.89001441, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.16394043, + "step": 1515, + "time_per_iteration": 2.646131992340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089961, + "balance_loss_mlp": 1.07367659, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.0632997915526053, + "language_loss": 0.87038326, + "learning_rate": 0.000830968963762425, + "loss": 0.88128293, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.1628418, + "step": 1516, + "time_per_iteration": 3.0603909492492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104224, + "balance_loss_mlp": 1.08745098, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.08225160647217689, + "language_loss": 0.83996677, + "learning_rate": 0.0008307353802953497, + "loss": 0.85100901, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.16784668, + "step": 1517, + "time_per_iteration": 2.7085869312286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105905, + "balance_loss_mlp": 1.08885777, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.07719324020211826, + "language_loss": 0.85852122, + "learning_rate": 0.0008305016684254125, + "loss": 0.86958027, 
+ "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.17053223, + "step": 1518, + "time_per_iteration": 2.843050241470337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114876, + "balance_loss_mlp": 1.0979718, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.07921278172023684, + "language_loss": 0.86861145, + "learning_rate": 0.0008302678282433479, + "loss": 0.87976027, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.16918945, + "step": 1519, + "time_per_iteration": 2.605964422225952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122329, + "balance_loss_mlp": 1.10534143, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07975311040882123, + "language_loss": 0.84663725, + "learning_rate": 0.0008300338598399411, + "loss": 0.85786051, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.17004395, + "step": 1520, + "time_per_iteration": 2.6344962120056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128968, + "balance_loss_mlp": 1.11150408, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.07139673380832469, + "language_loss": 0.9444648, + "learning_rate": 0.0008297997633060263, + "loss": 0.95575452, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.17480469, + "step": 1521, + "time_per_iteration": 2.5109918117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123567, + "balance_loss_mlp": 1.10605538, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07755113838475138, + "language_loss": 0.84917367, + "learning_rate": 0.0008295655387324883, + "loss": 0.86040938, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.17529297, + "step": 1522, + "time_per_iteration": 2.8314778804779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132674, + "balance_loss_mlp": 1.11578202, + "epoch": 0.29299730665640633, + "flos": 
458408512512.0, + "grad_norm": 0.08909358029202981, + "language_loss": 0.84779286, + "learning_rate": 0.0008293311862102609, + "loss": 0.85911965, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.16894531, + "step": 1523, + "time_per_iteration": 2.5455641746520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112826, + "balance_loss_mlp": 1.11147499, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.07268877656623862, + "language_loss": 0.88628173, + "learning_rate": 0.0008290967058303275, + "loss": 0.89756435, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.16796875, + "step": 1524, + "time_per_iteration": 2.5151915550231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114491, + "balance_loss_mlp": 1.1288048, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07556317822889831, + "language_loss": 0.86503643, + "learning_rate": 0.0008288620976837219, + "loss": 0.87648547, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.16101074, + "step": 1525, + "time_per_iteration": 2.526381731033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145799, + "balance_loss_mlp": 1.12897861, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.07322803654736391, + "language_loss": 0.826621, + "learning_rate": 0.000828627361861527, + "loss": 0.83807898, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.16833496, + "step": 1526, + "time_per_iteration": 2.629249334335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146511, + "balance_loss_mlp": 1.13019073, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.08423530938833095, + "language_loss": 0.84572363, + "learning_rate": 0.0008283924984548752, + "loss": 0.8571887, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.16320801, + "step": 1527, + "time_per_iteration": 2.966165542602539 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140198, + "balance_loss_mlp": 1.12374687, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.0645510946599831, + "language_loss": 0.8449617, + "learning_rate": 0.0008281575075549485, + "loss": 0.85636371, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.16455078, + "step": 1528, + "time_per_iteration": 2.58369779586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161514, + "balance_loss_mlp": 1.14954567, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.05917981842870205, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78514206, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.11962891, + "step": 1529, + "time_per_iteration": 4.658821105957031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131087, + "balance_loss_mlp": 1.1146121, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.08930626055051794, + "language_loss": 0.90355158, + "learning_rate": 0.0008276871436402469, + "loss": 0.91486251, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.16479492, + "step": 1530, + "time_per_iteration": 2.8411099910736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136163, + "balance_loss_mlp": 1.12017739, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.15569448105103711, + "language_loss": 0.87387383, + "learning_rate": 0.000827451770808083, + "loss": 0.88523543, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.15979004, + "step": 1531, + "time_per_iteration": 2.716938018798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126528, + "balance_loss_mlp": 1.11020815, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.07571292712277376, + "language_loss": 0.83393914, + "learning_rate": 0.0008272162708478674, + "loss": 0.84520441, + 
"num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.16320801, + "step": 1532, + "time_per_iteration": 2.589401960372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125487, + "balance_loss_mlp": 1.10926247, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.0702796828307527, + "language_loss": 0.85952383, + "learning_rate": 0.000826980643851029, + "loss": 0.87077868, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.16223145, + "step": 1533, + "time_per_iteration": 2.730564594268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111085, + "balance_loss_mlp": 1.09442306, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.090864784531222, + "language_loss": 0.84450942, + "learning_rate": 0.0008267448899090464, + "loss": 0.85561788, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.16430664, + "step": 1534, + "time_per_iteration": 2.5810909271240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116842, + "balance_loss_mlp": 1.10008121, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.07312583256714535, + "language_loss": 0.80780327, + "learning_rate": 0.0008265090091134473, + "loss": 0.81897163, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.16760254, + "step": 1535, + "time_per_iteration": 2.852243423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101211, + "balance_loss_mlp": 1.08464038, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.06558641515181687, + "language_loss": 0.80252028, + "learning_rate": 0.0008262730015558088, + "loss": 0.81353235, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.16577148, + "step": 1536, + "time_per_iteration": 2.888068675994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094086, + "balance_loss_mlp": 1.07725406, + "epoch": 0.29569065025009617, + "flos": 
764666625024.0, + "grad_norm": 0.0890497395672015, + "language_loss": 0.81906033, + "learning_rate": 0.0008260368673277574, + "loss": 0.83000118, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.16845703, + "step": 1537, + "time_per_iteration": 3.1171438694000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089572, + "balance_loss_mlp": 1.07263255, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.08897837479493585, + "language_loss": 0.83872563, + "learning_rate": 0.0008258006065209682, + "loss": 0.84962142, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.16955566, + "step": 1538, + "time_per_iteration": 2.749382972717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083685, + "balance_loss_mlp": 1.06642318, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.09390517967885302, + "language_loss": 0.80569965, + "learning_rate": 0.0008255642192271657, + "loss": 0.81653649, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.17285156, + "step": 1539, + "time_per_iteration": 2.834967851638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.07543612, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.08140985627423285, + "language_loss": 0.8348605, + "learning_rate": 0.0008253277055381241, + "loss": 0.84579086, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.17602539, + "step": 1540, + "time_per_iteration": 2.8553531169891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109997, + "balance_loss_mlp": 1.08266127, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.07492894951417867, + "language_loss": 0.8559624, + "learning_rate": 0.0008250910655456658, + "loss": 0.86696208, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.17321777, + "step": 1541, + "time_per_iteration": 3.141746997833252 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121244, + "balance_loss_mlp": 1.10318387, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.0890193674873045, + "language_loss": 0.83764815, + "learning_rate": 0.0008248542993416625, + "loss": 0.84886062, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.18054199, + "step": 1542, + "time_per_iteration": 2.634694814682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134671, + "balance_loss_mlp": 1.11682534, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.08265783697410327, + "language_loss": 0.83617258, + "learning_rate": 0.0008246174070180352, + "loss": 0.84751928, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.17871094, + "step": 1543, + "time_per_iteration": 2.7335524559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139596, + "balance_loss_mlp": 1.12247741, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09383563779300157, + "language_loss": 0.83888161, + "learning_rate": 0.0008243803886667537, + "loss": 0.85027754, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.17138672, + "step": 1544, + "time_per_iteration": 3.1672377586364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139138, + "balance_loss_mlp": 1.12212706, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.09212665263146659, + "language_loss": 0.7881431, + "learning_rate": 0.0008241432443798364, + "loss": 0.79953444, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.17028809, + "step": 1545, + "time_per_iteration": 2.8234944343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128999, + "balance_loss_mlp": 1.11242867, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.056688876570847646, + "language_loss": 0.85312325, + "learning_rate": 0.0008239059742493512, + "loss": 
0.86441326, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.16577148, + "step": 1546, + "time_per_iteration": 2.7027690410614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134853, + "balance_loss_mlp": 1.11818719, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.09085945068897121, + "language_loss": 0.87215161, + "learning_rate": 0.0008236685783674142, + "loss": 0.8835001, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.16674805, + "step": 1547, + "time_per_iteration": 3.0873892307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183829, + "balance_loss_mlp": 1.1713357, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.05428295829147524, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77405024, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.12451172, + "step": 1548, + "time_per_iteration": 4.899101972579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134552, + "balance_loss_mlp": 1.11795831, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.08128040699091903, + "language_loss": 0.81818366, + "learning_rate": 0.0008231934097178955, + "loss": 0.82952917, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.16601562, + "step": 1549, + "time_per_iteration": 2.6477086544036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139569, + "balance_loss_mlp": 1.12291551, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.07828537838902122, + "language_loss": 0.85219073, + "learning_rate": 0.0008229556371347903, + "loss": 0.86358643, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.16650391, + "step": 1550, + "time_per_iteration": 3.0261847972869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150744, + "balance_loss_mlp": 1.13455498, + "epoch": 0.29838399384378606, 
+ "flos": 875016152064.0, + "grad_norm": 0.08823136620200941, + "language_loss": 0.78994125, + "learning_rate": 0.0008227177391691874, + "loss": 0.8014487, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.16186523, + "step": 1551, + "time_per_iteration": 3.180002212524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136289, + "balance_loss_mlp": 1.11980236, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07744125357066795, + "language_loss": 0.89299029, + "learning_rate": 0.0008224797159134463, + "loss": 0.90435314, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.16491699, + "step": 1552, + "time_per_iteration": 2.739584445953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129762, + "balance_loss_mlp": 1.11325169, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.07274609898716765, + "language_loss": 0.83059317, + "learning_rate": 0.0008222415674599765, + "loss": 0.84189081, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.16516113, + "step": 1553, + "time_per_iteration": 3.1217970848083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118455, + "balance_loss_mlp": 1.10149145, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.07468995972707258, + "language_loss": 0.82944036, + "learning_rate": 0.0008220032939012349, + "loss": 0.84062493, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.1697998, + "step": 1554, + "time_per_iteration": 2.737661600112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111265, + "balance_loss_mlp": 1.0940038, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.06534643910619843, + "language_loss": 0.87635672, + "learning_rate": 0.0008217648953297277, + "loss": 0.88746935, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.17272949, + "step": 1555, + "time_per_iteration": 
2.9030354022979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109118, + "balance_loss_mlp": 1.09171319, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.07926146627709543, + "language_loss": 0.78007799, + "learning_rate": 0.0008215263718380095, + "loss": 0.79116917, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.17419434, + "step": 1556, + "time_per_iteration": 2.7085471153259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102748, + "balance_loss_mlp": 1.08450937, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.0948368117579541, + "language_loss": 0.84609628, + "learning_rate": 0.0008212877235186833, + "loss": 0.85712373, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.18237305, + "step": 1557, + "time_per_iteration": 2.7050936222076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136692, + "balance_loss_mlp": 1.12467551, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.04579697638503373, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78874254, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12011719, + "step": 1558, + "time_per_iteration": 4.93830418586731 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098204, + "balance_loss_mlp": 1.08031082, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.08681594057082924, + "language_loss": 0.81027186, + "learning_rate": 0.0008208100527678611, + "loss": 0.8212539, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.17907715, + "step": 1559, + "time_per_iteration": 2.6041250228881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_mlp": 1.08412552, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.11630596930036842, + "language_loss": 0.78128254, + "learning_rate": 
0.0008205710305218135, + "loss": 0.79229701, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.17333984, + "step": 1560, + "time_per_iteration": 3.0562148094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109931, + "balance_loss_mlp": 1.08225095, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.07630099015555136, + "language_loss": 0.89525402, + "learning_rate": 0.0008203318838190541, + "loss": 0.90624714, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.17077637, + "step": 1561, + "time_per_iteration": 2.7627954483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110369, + "balance_loss_mlp": 1.08669066, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.09266250591977641, + "language_loss": 0.84876859, + "learning_rate": 0.0008200926127524281, + "loss": 0.85980552, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.17016602, + "step": 1562, + "time_per_iteration": 2.699997663497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111077, + "balance_loss_mlp": 1.09415245, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.08848358123460635, + "language_loss": 0.82834399, + "learning_rate": 0.0008198532174148289, + "loss": 0.83945167, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.16625977, + "step": 1563, + "time_per_iteration": 2.728264570236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088743, + "balance_loss_mlp": 1.07691729, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.03477061119396021, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81774914, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.11816406, + "step": 1564, + "time_per_iteration": 4.858918905258179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148782, + "balance_loss_mlp": 1.13198543, + 
"epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.1259196892608865, + "language_loss": 0.88896626, + "learning_rate": 0.0008193740542985244, + "loss": 0.9004541, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.16809082, + "step": 1565, + "time_per_iteration": 2.6722562313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165467, + "balance_loss_mlp": 1.14907598, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.1324055806972963, + "language_loss": 0.86720473, + "learning_rate": 0.0008191342867058467, + "loss": 0.8788594, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.16394043, + "step": 1566, + "time_per_iteration": 2.7314035892486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147033, + "balance_loss_mlp": 1.13058197, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.09630003386887155, + "language_loss": 0.83068216, + "learning_rate": 0.0008188943952142509, + "loss": 0.84215248, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.16455078, + "step": 1567, + "time_per_iteration": 2.8423235416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128472, + "balance_loss_mlp": 1.11148453, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.09368409570014515, + "language_loss": 0.82277513, + "learning_rate": 0.0008186543799168711, + "loss": 0.83405983, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.17004395, + "step": 1568, + "time_per_iteration": 3.1569459438323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096028, + "balance_loss_mlp": 1.07919598, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.20562474195503389, + "language_loss": 0.88231719, + "learning_rate": 0.0008184142409068892, + "loss": 0.89327747, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.16845703, + "step": 1569, + 
"time_per_iteration": 3.0334763526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089793, + "balance_loss_mlp": 1.0729959, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.06986495925142319, + "language_loss": 0.86445761, + "learning_rate": 0.000818173978277536, + "loss": 0.87535548, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.16809082, + "step": 1570, + "time_per_iteration": 2.6637074947357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085635, + "balance_loss_mlp": 1.06840897, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.09310337511128065, + "language_loss": 0.8345744, + "learning_rate": 0.000817933592122089, + "loss": 0.84543073, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.17236328, + "step": 1571, + "time_per_iteration": 2.693112850189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085619, + "balance_loss_mlp": 1.06780863, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.10986906736250873, + "language_loss": 0.83327937, + "learning_rate": 0.0008176930825338749, + "loss": 0.84413558, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.17810059, + "step": 1572, + "time_per_iteration": 2.609584331512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086583, + "balance_loss_mlp": 1.06848717, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.10627002925019795, + "language_loss": 0.88423979, + "learning_rate": 0.0008174524496062679, + "loss": 0.89510572, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.1809082, + "step": 1573, + "time_per_iteration": 2.9317731857299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085921, + "balance_loss_mlp": 1.06767023, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.08890838553235277, + "language_loss": 0.85423905, + 
"learning_rate": 0.0008172116934326894, + "loss": 0.86509824, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.18249512, + "step": 1574, + "time_per_iteration": 2.795232057571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085798, + "balance_loss_mlp": 1.06757045, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.0994527497506169, + "language_loss": 0.87673843, + "learning_rate": 0.0008169708141066097, + "loss": 0.88759637, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.18212891, + "step": 1575, + "time_per_iteration": 2.587369203567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088984, + "balance_loss_mlp": 1.07053041, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.4142555186010625, + "language_loss": 0.90523762, + "learning_rate": 0.0008167298117215465, + "loss": 0.91612744, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.18432617, + "step": 1576, + "time_per_iteration": 2.591120481491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109689, + "balance_loss_mlp": 1.07822132, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.08528414160414997, + "language_loss": 0.87905335, + "learning_rate": 0.0008164886863710649, + "loss": 0.89002216, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.18652344, + "step": 1577, + "time_per_iteration": 2.9462757110595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130352, + "balance_loss_mlp": 1.11145782, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07426584678404557, + "language_loss": 0.85645878, + "learning_rate": 0.0008162474381487783, + "loss": 0.86776227, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.1887207, + "step": 1578, + "time_per_iteration": 3.1258718967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170066, + 
"balance_loss_mlp": 1.15105188, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.10196200235578438, + "language_loss": 0.849518, + "learning_rate": 0.0008160060671483475, + "loss": 0.86121869, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.19018555, + "step": 1579, + "time_per_iteration": 2.686903953552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193624, + "balance_loss_mlp": 1.17542076, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.11175205501845424, + "language_loss": 0.82875144, + "learning_rate": 0.0008157645734634809, + "loss": 0.84068769, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.18212891, + "step": 1580, + "time_per_iteration": 2.623169183731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146657, + "balance_loss_mlp": 1.13449764, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.05359937724929427, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78043151, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.12158203, + "step": 1581, + "time_per_iteration": 4.941681623458862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126623, + "balance_loss_mlp": 1.11465442, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.04979857074148905, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74341118, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.11962891, + "step": 1582, + "time_per_iteration": 4.878013372421265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233201, + "balance_loss_mlp": 1.21421146, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.08528831092857085, + "language_loss": 0.8396011, + "learning_rate": 0.000815039357240067, + "loss": 0.85193312, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 
0.18969727, + "step": 1583, + "time_per_iteration": 2.643695116043091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228928, + "balance_loss_mlp": 1.21003366, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.10406683839721904, + "language_loss": 0.8531003, + "learning_rate": 0.0008147973737554952, + "loss": 0.86538959, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.18884277, + "step": 1584, + "time_per_iteration": 2.780329942703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201707, + "balance_loss_mlp": 1.18393278, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.07761853967257432, + "language_loss": 0.86104375, + "learning_rate": 0.000814555268055744, + "loss": 0.87306082, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.17785645, + "step": 1585, + "time_per_iteration": 2.6921656131744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196281, + "balance_loss_mlp": 1.17799401, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07850387771459345, + "language_loss": 0.86948889, + "learning_rate": 0.0008143130402348073, + "loss": 0.88145167, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.18273926, + "step": 1586, + "time_per_iteration": 2.6515746116638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165884, + "balance_loss_mlp": 1.14803839, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.0685053805978033, + "language_loss": 0.79063147, + "learning_rate": 0.0008140706903867265, + "loss": 0.80229032, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.17858887, + "step": 1587, + "time_per_iteration": 2.823451042175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158917, + "balance_loss_mlp": 1.14067745, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.09375856425609289, + 
"language_loss": 0.90278405, + "learning_rate": 0.0008138282186055897, + "loss": 0.91437322, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.18261719, + "step": 1588, + "time_per_iteration": 2.7146568298339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147472, + "balance_loss_mlp": 1.12988853, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.0770581210118419, + "language_loss": 0.82476223, + "learning_rate": 0.0008135856249855331, + "loss": 0.83623695, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.17614746, + "step": 1589, + "time_per_iteration": 2.71938157081604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141231, + "balance_loss_mlp": 1.12317085, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.10579892777067937, + "language_loss": 0.89201659, + "learning_rate": 0.0008133429096207398, + "loss": 0.90342891, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.18066406, + "step": 1590, + "time_per_iteration": 2.828059434890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326323, + "balance_loss_mlp": 1.31087315, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.09384482719125187, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76638579, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.15429688, + "step": 1591, + "time_per_iteration": 5.056639909744263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158477, + "balance_loss_mlp": 1.13997602, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.07055782584393462, + "language_loss": 0.86496353, + "learning_rate": 0.0008128571140339123, + "loss": 0.87654829, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.18505859, + "step": 1592, + "time_per_iteration": 2.6639931201934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01148416, + "balance_loss_mlp": 1.12930679, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.0722691659040447, + "language_loss": 0.87266612, + "learning_rate": 0.0008126140340004805, + "loss": 0.88415021, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.19104004, + "step": 1593, + "time_per_iteration": 2.574216604232788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153692, + "balance_loss_mlp": 1.1345824, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.07242693719108233, + "language_loss": 0.81765437, + "learning_rate": 0.0008123708325995172, + "loss": 0.82919127, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.19104004, + "step": 1594, + "time_per_iteration": 3.2430498600006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160899, + "balance_loss_mlp": 1.14182544, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.08669645453401467, + "language_loss": 0.79659396, + "learning_rate": 0.0008121275099254414, + "loss": 0.80820298, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.19067383, + "step": 1595, + "time_per_iteration": 2.992558479309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116479, + "balance_loss_mlp": 1.14517975, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06321681758762837, + "language_loss": 0.88210988, + "learning_rate": 0.0008118840660727194, + "loss": 0.8937577, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.19592285, + "step": 1596, + "time_per_iteration": 2.655043840408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116269, + "balance_loss_mlp": 1.14316404, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.06781928625830316, + "language_loss": 0.87805635, + "learning_rate": 0.0008116405011358644, + "loss": 0.88968325, + "num_input_tokens_seen": 132170896, + 
"router_z_loss_mlp": 0.19519043, + "step": 1597, + "time_per_iteration": 3.180513620376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172311, + "balance_loss_mlp": 1.15260601, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.0749329830796044, + "language_loss": 0.79566741, + "learning_rate": 0.0008113968152094369, + "loss": 0.80739057, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.19702148, + "step": 1598, + "time_per_iteration": 2.6038942337036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164591, + "balance_loss_mlp": 1.14439654, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.09148494515579969, + "language_loss": 0.82006347, + "learning_rate": 0.0008111530083880438, + "loss": 0.83170938, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.2019043, + "step": 1599, + "time_per_iteration": 2.9283370971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155155, + "balance_loss_mlp": 1.13517594, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.08461014219336162, + "language_loss": 0.86254573, + "learning_rate": 0.0008109090807663399, + "loss": 0.87409735, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.19970703, + "step": 1600, + "time_per_iteration": 2.825857639312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137804, + "balance_loss_mlp": 1.11677539, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.062223790852464995, + "language_loss": 0.88488859, + "learning_rate": 0.0008106650324390257, + "loss": 0.89626658, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.21032715, + "step": 1601, + "time_per_iteration": 2.8589255809783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112197, + "balance_loss_mlp": 1.10128665, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 
0.07165476987233708, + "language_loss": 0.81206429, + "learning_rate": 0.0008104208635008493, + "loss": 0.82328397, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.20690918, + "step": 1602, + "time_per_iteration": 2.6751368045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109456, + "balance_loss_mlp": 1.0886662, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.08196336802935668, + "language_loss": 0.81529546, + "learning_rate": 0.0008101765740466058, + "loss": 0.82638997, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.20788574, + "step": 1603, + "time_per_iteration": 2.5513291358947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103563, + "balance_loss_mlp": 1.08332109, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.0890222565523069, + "language_loss": 0.83796382, + "learning_rate": 0.0008099321641711364, + "loss": 0.8489995, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.20227051, + "step": 1604, + "time_per_iteration": 2.6779870986938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104404, + "balance_loss_mlp": 1.08353007, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.07300879059514653, + "language_loss": 0.83213902, + "learning_rate": 0.0008096876339693295, + "loss": 0.84318304, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.2088623, + "step": 1605, + "time_per_iteration": 2.667900800704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109091, + "balance_loss_mlp": 1.07006013, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.08337461956862639, + "language_loss": 0.81168187, + "learning_rate": 0.0008094429835361206, + "loss": 0.82259107, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.20861816, + "step": 1606, + "time_per_iteration": 3.0076494216918945 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01081794, + "balance_loss_mlp": 1.06069374, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.10542585380202701, + "language_loss": 0.85789704, + "learning_rate": 0.0008091982129664908, + "loss": 0.86871505, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.21105957, + "step": 1607, + "time_per_iteration": 2.730372428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087916, + "balance_loss_mlp": 1.06643414, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.07933352528165237, + "language_loss": 0.83225489, + "learning_rate": 0.0008089533223554687, + "loss": 0.84313411, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.21484375, + "step": 1608, + "time_per_iteration": 2.7049362659454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090604, + "balance_loss_mlp": 1.06942058, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.08271353671646894, + "language_loss": 0.85293424, + "learning_rate": 0.0008087083117981294, + "loss": 0.86384022, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.21179199, + "step": 1609, + "time_per_iteration": 2.8826427459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101999, + "balance_loss_mlp": 1.08043373, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.0996721022061816, + "language_loss": 0.88292408, + "learning_rate": 0.0008084631813895943, + "loss": 0.89394403, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.21569824, + "step": 1610, + "time_per_iteration": 2.7805559635162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121386, + "balance_loss_mlp": 1.10027432, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07842877021383077, + "language_loss": 0.83548594, + "learning_rate": 0.0008082179312250315, + "loss": 0.84669983, + 
"num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.21118164, + "step": 1611, + "time_per_iteration": 2.676135540008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388019, + "balance_loss_mlp": 1.36951745, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.08809519842771894, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81243861, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.18457031, + "step": 1612, + "time_per_iteration": 4.860031843185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126274, + "balance_loss_mlp": 1.24729049, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.05130460412725523, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77892077, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.15429688, + "step": 1613, + "time_per_iteration": 5.034562110900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199222, + "balance_loss_mlp": 1.18011272, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.0938643891544465, + "language_loss": 0.82239884, + "learning_rate": 0.0008074814631475545, + "loss": 0.83439106, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.19091797, + "step": 1614, + "time_per_iteration": 3.336702585220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212604, + "balance_loss_mlp": 1.19325638, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.08076281903906762, + "language_loss": 0.79283953, + "learning_rate": 0.0008072357349114907, + "loss": 0.80496556, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.19335938, + "step": 1615, + "time_per_iteration": 2.6835010051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230065, + "balance_loss_mlp": 1.21150458, + "epoch": 0.3108888033859177, + "flos": 
510505804800.0, + "grad_norm": 0.10215362910815345, + "language_loss": 0.88464314, + "learning_rate": 0.0008069898873959363, + "loss": 0.89694381, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.1854248, + "step": 1616, + "time_per_iteration": 2.669456958770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213455, + "balance_loss_mlp": 1.19514489, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.07300003813068634, + "language_loss": 0.85508597, + "learning_rate": 0.0008067439206963375, + "loss": 0.86722052, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.18310547, + "step": 1617, + "time_per_iteration": 2.641707420349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202163, + "balance_loss_mlp": 1.18378067, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.08997138772617237, + "language_loss": 0.86023128, + "learning_rate": 0.0008064978349081873, + "loss": 0.87225294, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.18395996, + "step": 1618, + "time_per_iteration": 2.998687982559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181054, + "balance_loss_mlp": 1.1626246, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.07073814720845698, + "language_loss": 0.8619715, + "learning_rate": 0.0008062516301270245, + "loss": 0.87378204, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.1842041, + "step": 1619, + "time_per_iteration": 2.72948956489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187406, + "balance_loss_mlp": 1.16931009, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.06466481546389395, + "language_loss": 0.88310599, + "learning_rate": 0.0008060053064484343, + "loss": 0.89498007, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.18115234, + "step": 1620, + "time_per_iteration": 2.9406392574310303 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188397, + "balance_loss_mlp": 1.17067063, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.09059197010434686, + "language_loss": 0.84835637, + "learning_rate": 0.0008057588639680482, + "loss": 0.86024034, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.17724609, + "step": 1621, + "time_per_iteration": 2.7712435722351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172579, + "balance_loss_mlp": 1.15451908, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.0998699448976919, + "language_loss": 0.83181798, + "learning_rate": 0.0008055123027815434, + "loss": 0.84354383, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.18078613, + "step": 1622, + "time_per_iteration": 2.918195962905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158801, + "balance_loss_mlp": 1.14063358, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.08307305946300769, + "language_loss": 0.8472932, + "learning_rate": 0.0008052656229846436, + "loss": 0.85888124, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.18164062, + "step": 1623, + "time_per_iteration": 2.6911518573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141486, + "balance_loss_mlp": 1.12317586, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.13857337515277973, + "language_loss": 0.90054119, + "learning_rate": 0.0008050188246731182, + "loss": 0.91195607, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.18322754, + "step": 1624, + "time_per_iteration": 2.682352066040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132158, + "balance_loss_mlp": 1.11350143, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.07575228871239431, + "language_loss": 0.81929862, + "learning_rate": 0.0008047719079427834, + "loss": 
0.83062017, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.18664551, + "step": 1625, + "time_per_iteration": 2.9942879676818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230131, + "balance_loss_mlp": 1.21601677, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.048676192852424666, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75581837, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.14160156, + "step": 1626, + "time_per_iteration": 4.848233938217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108724, + "balance_loss_mlp": 1.08925653, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.0694146578244244, + "language_loss": 0.86078912, + "learning_rate": 0.0008042777196091757, + "loss": 0.87187636, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.19458008, + "step": 1627, + "time_per_iteration": 2.701900005340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116209, + "balance_loss_mlp": 1.09631276, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.08749628678496815, + "language_loss": 0.81888652, + "learning_rate": 0.0008040304481977643, + "loss": 0.83004862, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.19885254, + "step": 1628, + "time_per_iteration": 2.696526527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138715, + "balance_loss_mlp": 1.11946249, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.07447099765210985, + "language_loss": 0.8675555, + "learning_rate": 0.0008037830587512649, + "loss": 0.87894267, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.19250488, + "step": 1629, + "time_per_iteration": 3.0616016387939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134253, + "balance_loss_mlp": 1.11413062, + "epoch": 0.31358214697960757, + 
"flos": 393823669248.0, + "grad_norm": 0.09771619875867958, + "language_loss": 0.78561771, + "learning_rate": 0.0008035355513657224, + "loss": 0.79696023, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.20117188, + "step": 1630, + "time_per_iteration": 2.4754045009613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137016, + "balance_loss_mlp": 1.11708379, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.08006054346576318, + "language_loss": 0.9267844, + "learning_rate": 0.0008032879261372279, + "loss": 0.93815458, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.19921875, + "step": 1631, + "time_per_iteration": 2.802116870880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162193, + "balance_loss_mlp": 1.14845991, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.027777304949473513, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80798036, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.13769531, + "step": 1632, + "time_per_iteration": 5.508919715881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119807, + "balance_loss_mlp": 1.10029221, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.0647776963699187, + "language_loss": 0.86985779, + "learning_rate": 0.0008027923225359748, + "loss": 0.88105589, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.19506836, + "step": 1633, + "time_per_iteration": 2.600407600402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108986, + "balance_loss_mlp": 1.08867252, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.07494658582155435, + "language_loss": 0.87969911, + "learning_rate": 0.0008025443443556267, + "loss": 0.89078891, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.20300293, + "step": 1634, + "time_per_iteration": 2.721635103225708 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103961, + "balance_loss_mlp": 1.08468509, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.09628820684288855, + "language_loss": 0.88015246, + "learning_rate": 0.000802296248717147, + "loss": 0.89119208, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.19262695, + "step": 1635, + "time_per_iteration": 2.94401478767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090786, + "balance_loss_mlp": 1.07087779, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.07971253455476307, + "language_loss": 0.78918988, + "learning_rate": 0.0008020480357168554, + "loss": 0.8000977, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.19897461, + "step": 1636, + "time_per_iteration": 2.863992691040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089663, + "balance_loss_mlp": 1.07011271, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.07737806088204505, + "language_loss": 0.87917638, + "learning_rate": 0.0008017997054511165, + "loss": 0.890073, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.1953125, + "step": 1637, + "time_per_iteration": 2.586543083190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087265, + "balance_loss_mlp": 1.06765532, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.08038806705740831, + "language_loss": 0.85134554, + "learning_rate": 0.0008015512580163407, + "loss": 0.86221826, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.19592285, + "step": 1638, + "time_per_iteration": 2.8016490936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084302, + "balance_loss_mlp": 1.06364322, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.07403915674476273, + "language_loss": 0.80143899, + "learning_rate": 0.0008013026935089838, + "loss": 
0.81228203, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.20666504, + "step": 1639, + "time_per_iteration": 2.906219244003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086238, + "balance_loss_mlp": 1.06543589, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.08080644571808258, + "language_loss": 0.83962494, + "learning_rate": 0.0008010540120255472, + "loss": 0.85048735, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.20788574, + "step": 1640, + "time_per_iteration": 2.6874494552612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093894, + "balance_loss_mlp": 1.07238901, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.10412897550370145, + "language_loss": 0.85903674, + "learning_rate": 0.0008008052136625774, + "loss": 0.86997569, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.21508789, + "step": 1641, + "time_per_iteration": 2.806689977645874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101865, + "balance_loss_mlp": 1.08080053, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.07569050828740802, + "language_loss": 0.86666101, + "learning_rate": 0.0008005562985166666, + "loss": 0.87767971, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.21069336, + "step": 1642, + "time_per_iteration": 2.7800753116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109644, + "balance_loss_mlp": 1.08823395, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.05889143992207802, + "language_loss": 0.85174221, + "learning_rate": 0.0008003072666844524, + "loss": 0.86283863, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.21411133, + "step": 1643, + "time_per_iteration": 2.722987651824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122613, + "balance_loss_mlp": 1.10185909, + "epoch": 0.3162754905732974, 
+ "flos": 486669772800.0, + "grad_norm": 0.0837642836105996, + "language_loss": 0.82220256, + "learning_rate": 0.0008000581182626173, + "loss": 0.83342868, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.20751953, + "step": 1644, + "time_per_iteration": 2.5624425411224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143306, + "balance_loss_mlp": 1.12279046, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.21399278605623545, + "language_loss": 0.85377562, + "learning_rate": 0.0007998088533478894, + "loss": 0.86520875, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.2052002, + "step": 1645, + "time_per_iteration": 2.657808542251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118356, + "balance_loss_mlp": 1.09847164, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.1165927047614104, + "language_loss": 0.83989012, + "learning_rate": 0.000799559472037042, + "loss": 0.85107368, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.19873047, + "step": 1646, + "time_per_iteration": 2.5764071941375732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101642, + "balance_loss_mlp": 1.08161449, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.06134983371250154, + "language_loss": 0.87497842, + "learning_rate": 0.0007993099744268932, + "loss": 0.88599485, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.20031738, + "step": 1647, + "time_per_iteration": 2.9123756885528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094967, + "balance_loss_mlp": 1.07502329, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.08774031682584008, + "language_loss": 0.87840933, + "learning_rate": 0.000799060360614307, + "loss": 0.889359, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.19934082, + "step": 1648, + "time_per_iteration": 2.7346584796905518 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089439, + "balance_loss_mlp": 1.06954336, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.07558157708493889, + "language_loss": 0.8330996, + "learning_rate": 0.0007988106306961917, + "loss": 0.84399396, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.19885254, + "step": 1649, + "time_per_iteration": 3.1326329708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091589, + "balance_loss_mlp": 1.07182384, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.0875083493892423, + "language_loss": 0.84519339, + "learning_rate": 0.0007985607847695014, + "loss": 0.85610926, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.19750977, + "step": 1650, + "time_per_iteration": 2.689587354660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087502, + "balance_loss_mlp": 1.06813097, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.10331276722207645, + "language_loss": 0.82647395, + "learning_rate": 0.0007983108229312345, + "loss": 0.83734906, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.19348145, + "step": 1651, + "time_per_iteration": 2.935060501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094077, + "balance_loss_mlp": 1.07493234, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.08920057207213788, + "language_loss": 0.86297011, + "learning_rate": 0.0007980607452784351, + "loss": 0.8739109, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.19128418, + "step": 1652, + "time_per_iteration": 2.5893616676330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090057, + "balance_loss_mlp": 1.07070947, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.10003790987475829, + "language_loss": 0.90127802, + "learning_rate": 0.0007978105519081919, + "loss": 
0.91217864, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.1932373, + "step": 1653, + "time_per_iteration": 2.7026524543762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091886, + "balance_loss_mlp": 1.07306278, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.08393617058492224, + "language_loss": 0.87581307, + "learning_rate": 0.0007975602429176385, + "loss": 0.88673192, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.18811035, + "step": 1654, + "time_per_iteration": 2.652863025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110285, + "balance_loss_mlp": 1.08389616, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.08283763038644905, + "language_loss": 0.8141948, + "learning_rate": 0.0007973098184039536, + "loss": 0.82522333, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.18933105, + "step": 1655, + "time_per_iteration": 2.658590316772461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113313, + "balance_loss_mlp": 1.09477568, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.08159903981201219, + "language_loss": 0.86618698, + "learning_rate": 0.0007970592784643602, + "loss": 0.87732017, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.18518066, + "step": 1656, + "time_per_iteration": 2.892390251159668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138107, + "balance_loss_mlp": 1.11967695, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.07828329710087445, + "language_loss": 0.84808218, + "learning_rate": 0.0007968086231961272, + "loss": 0.85946327, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.18432617, + "step": 1657, + "time_per_iteration": 2.659250497817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169742, + "balance_loss_mlp": 1.15010786, + "epoch": 0.3189688341669873, + 
"flos": 489580402176.0, + "grad_norm": 0.1537731911276923, + "language_loss": 0.8331663, + "learning_rate": 0.0007965578526965671, + "loss": 0.84486371, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.19616699, + "step": 1658, + "time_per_iteration": 2.6129345893859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115739, + "balance_loss_mlp": 1.13819742, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.07993574913147765, + "language_loss": 0.86468869, + "learning_rate": 0.0007963069670630377, + "loss": 0.87626261, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.19189453, + "step": 1659, + "time_per_iteration": 2.735495090484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150627, + "balance_loss_mlp": 1.13118374, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.07695546581371572, + "language_loss": 0.87941194, + "learning_rate": 0.0007960559663929416, + "loss": 0.8909182, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.19421387, + "step": 1660, + "time_per_iteration": 2.6464481353759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144507, + "balance_loss_mlp": 1.12452734, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.0701260521045673, + "language_loss": 0.87574112, + "learning_rate": 0.0007958048507837259, + "loss": 0.88718617, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.19995117, + "step": 1661, + "time_per_iteration": 2.964620590209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135721, + "balance_loss_mlp": 1.11478782, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.08820049354030167, + "language_loss": 0.87464488, + "learning_rate": 0.0007955536203328822, + "loss": 0.88600206, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.20947266, + "step": 1662, + "time_per_iteration": 2.9402856826782227 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128807, + "balance_loss_mlp": 1.10893452, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.0703581314218412, + "language_loss": 0.83491433, + "learning_rate": 0.0007953022751379469, + "loss": 0.84620237, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.1986084, + "step": 1663, + "time_per_iteration": 2.8694913387298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133101, + "balance_loss_mlp": 1.11183429, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.07762769933283196, + "language_loss": 0.81855732, + "learning_rate": 0.000795050815296501, + "loss": 0.82988834, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.21264648, + "step": 1664, + "time_per_iteration": 2.9839534759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133942, + "balance_loss_mlp": 1.11387873, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.06538130148842129, + "language_loss": 0.92802906, + "learning_rate": 0.0007947992409061695, + "loss": 0.93936849, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.20068359, + "step": 1665, + "time_per_iteration": 2.600677013397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128395, + "balance_loss_mlp": 1.10815299, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07570782620206934, + "language_loss": 0.86083347, + "learning_rate": 0.0007945475520646226, + "loss": 0.8721174, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.20227051, + "step": 1666, + "time_per_iteration": 2.960444211959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126888, + "balance_loss_mlp": 1.10798109, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.08296696017450861, + "language_loss": 0.84656757, + "learning_rate": 0.0007942957488695743, + "loss": 
0.85783648, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.18908691, + "step": 1667, + "time_per_iteration": 2.671600341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131636, + "balance_loss_mlp": 1.11284864, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06557982969248469, + "language_loss": 0.80884814, + "learning_rate": 0.0007940438314187833, + "loss": 0.82016456, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.18774414, + "step": 1668, + "time_per_iteration": 3.0618937015533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129602, + "balance_loss_mlp": 1.11102939, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.08496063360517363, + "language_loss": 0.80308306, + "learning_rate": 0.0007937917998100529, + "loss": 0.8143791, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.18566895, + "step": 1669, + "time_per_iteration": 2.6219253540039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139247, + "balance_loss_mlp": 1.12098432, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.07361411804364891, + "language_loss": 0.78932178, + "learning_rate": 0.0007935396541412302, + "loss": 0.80071419, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.18273926, + "step": 1670, + "time_per_iteration": 2.6380372047424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148818, + "balance_loss_mlp": 1.13088846, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.07283292072888313, + "language_loss": 0.85630834, + "learning_rate": 0.0007932873945102068, + "loss": 0.86779654, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.17932129, + "step": 1671, + "time_per_iteration": 2.6828458309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171107, + "balance_loss_mlp": 1.15642071, + "epoch": 0.3216621777606772, 
+ "flos": 1383341815296.0, + "grad_norm": 0.02887484158654099, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76932883, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.14648438, + "step": 1672, + "time_per_iteration": 4.8265416622161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160939, + "balance_loss_mlp": 1.14286733, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.07500648032395062, + "language_loss": 0.86484933, + "learning_rate": 0.0007927825337533461, + "loss": 0.87645876, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.18078613, + "step": 1673, + "time_per_iteration": 2.7402546405792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155615, + "balance_loss_mlp": 1.1377933, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.10786589074132553, + "language_loss": 0.84594876, + "learning_rate": 0.0007925299328235131, + "loss": 0.8575049, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.17822266, + "step": 1674, + "time_per_iteration": 2.663360118865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149086, + "balance_loss_mlp": 1.13095438, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.09107412637612472, + "language_loss": 0.84947217, + "learning_rate": 0.000792277218323488, + "loss": 0.86096299, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.18139648, + "step": 1675, + "time_per_iteration": 2.608579158782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136338, + "balance_loss_mlp": 1.11837292, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.07405590971136047, + "language_loss": 0.84631819, + "learning_rate": 0.0007920243903513833, + "loss": 0.85768151, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.17956543, + "step": 1676, + "time_per_iteration": 
2.598543882369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128075, + "balance_loss_mlp": 1.10991931, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.08030295134522303, + "language_loss": 0.83944809, + "learning_rate": 0.0007917714490053556, + "loss": 0.85072881, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.18164062, + "step": 1677, + "time_per_iteration": 2.6944823265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126784, + "balance_loss_mlp": 1.10863996, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.06747924585348261, + "language_loss": 0.86233467, + "learning_rate": 0.0007915183943836055, + "loss": 0.87360251, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.18164062, + "step": 1678, + "time_per_iteration": 2.9165165424346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120975, + "balance_loss_mlp": 1.10280752, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.11051827421234449, + "language_loss": 0.84204686, + "learning_rate": 0.0007912652265843773, + "loss": 0.85325664, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.18164062, + "step": 1679, + "time_per_iteration": 3.141361713409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108875, + "balance_loss_mlp": 1.09056485, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.06834343380772315, + "language_loss": 0.81678128, + "learning_rate": 0.0007910119457059597, + "loss": 0.82787001, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.1829834, + "step": 1680, + "time_per_iteration": 2.7235679626464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097161, + "balance_loss_mlp": 1.07836151, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.08108919878534793, + "language_loss": 0.80109823, + "learning_rate": 
0.0007907585518466849, + "loss": 0.81206989, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.18798828, + "step": 1681, + "time_per_iteration": 2.9778435230255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096448, + "balance_loss_mlp": 1.07823253, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.07179806444318433, + "language_loss": 0.89356047, + "learning_rate": 0.000790505045104929, + "loss": 0.90452492, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.18200684, + "step": 1682, + "time_per_iteration": 2.522502899169922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092596, + "balance_loss_mlp": 1.07453537, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.07276753556485034, + "language_loss": 0.86845744, + "learning_rate": 0.0007902514255791125, + "loss": 0.87938344, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.18066406, + "step": 1683, + "time_per_iteration": 2.7951602935791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094831, + "balance_loss_mlp": 1.07612705, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.14328213003802046, + "language_loss": 0.87945193, + "learning_rate": 0.0007899976933676986, + "loss": 0.89040023, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.18701172, + "step": 1684, + "time_per_iteration": 3.0410313606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095322, + "balance_loss_mlp": 1.0759027, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.09505530250353386, + "language_loss": 0.8717491, + "learning_rate": 0.0007897438485691955, + "loss": 0.88270235, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.19396973, + "step": 1685, + "time_per_iteration": 2.717643976211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109744, + "balance_loss_mlp": 1.09030128, 
+ "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.0737580177172555, + "language_loss": 0.82153177, + "learning_rate": 0.0007894898912821542, + "loss": 0.8326292, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.19433594, + "step": 1686, + "time_per_iteration": 2.529229164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103459, + "balance_loss_mlp": 1.0848738, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.06566778614017829, + "language_loss": 0.86626494, + "learning_rate": 0.0007892358216051695, + "loss": 0.87729949, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.18566895, + "step": 1687, + "time_per_iteration": 2.7486979961395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103486, + "balance_loss_mlp": 1.08472204, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.06759540868164342, + "language_loss": 0.91712224, + "learning_rate": 0.0007889816396368803, + "loss": 0.92815715, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.18737793, + "step": 1688, + "time_per_iteration": 2.6558406352996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114869, + "balance_loss_mlp": 1.09629631, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.08904939998236257, + "language_loss": 0.85158062, + "learning_rate": 0.0007887273454759687, + "loss": 0.86272931, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.18566895, + "step": 1689, + "time_per_iteration": 2.4704487323760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120516, + "balance_loss_mlp": 1.10219383, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.07572457526068059, + "language_loss": 0.82346898, + "learning_rate": 0.0007884729392211603, + "loss": 0.83467412, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.18322754, + "step": 1690, + 
"time_per_iteration": 2.703683614730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110893, + "balance_loss_mlp": 1.09243917, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09550307140961752, + "language_loss": 0.85592222, + "learning_rate": 0.0007882184209712245, + "loss": 0.86703116, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.18444824, + "step": 1691, + "time_per_iteration": 2.560342788696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103123, + "balance_loss_mlp": 1.0847528, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.06639873617663411, + "language_loss": 0.85215127, + "learning_rate": 0.000787963790824974, + "loss": 0.86318254, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.18371582, + "step": 1692, + "time_per_iteration": 3.01053786277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102989, + "balance_loss_mlp": 1.08483362, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.0791061376464097, + "language_loss": 0.89282072, + "learning_rate": 0.0007877090488812651, + "loss": 0.90385056, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.18164062, + "step": 1693, + "time_per_iteration": 2.4398083686828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101181, + "balance_loss_mlp": 1.08242917, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.07726533895166562, + "language_loss": 0.8386811, + "learning_rate": 0.0007874541952389973, + "loss": 0.84969294, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.1875, + "step": 1694, + "time_per_iteration": 2.6756813526153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104451, + "balance_loss_mlp": 1.08591402, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.08042259552829657, + "language_loss": 0.86563015, + "learning_rate": 
0.0007871992299971136, + "loss": 0.87667465, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.1854248, + "step": 1695, + "time_per_iteration": 2.5899436473846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114554, + "balance_loss_mlp": 1.096017, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.10859344338562153, + "language_loss": 0.84131289, + "learning_rate": 0.0007869441532546001, + "loss": 0.85245848, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.18530273, + "step": 1696, + "time_per_iteration": 2.7561304569244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107684, + "balance_loss_mlp": 1.08946884, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.10465149109525512, + "language_loss": 0.79480183, + "learning_rate": 0.0007866889651104867, + "loss": 0.8058787, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.18225098, + "step": 1697, + "time_per_iteration": 2.8031740188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108686, + "balance_loss_mlp": 1.08992255, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.0906406666849178, + "language_loss": 0.83109629, + "learning_rate": 0.000786433665663846, + "loss": 0.84218317, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.18762207, + "step": 1698, + "time_per_iteration": 2.6932730674743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106374, + "balance_loss_mlp": 1.08788502, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.09684550827651525, + "language_loss": 0.86934984, + "learning_rate": 0.0007861782550137942, + "loss": 0.88041353, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.18481445, + "step": 1699, + "time_per_iteration": 2.924246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111141, + "balance_loss_mlp": 1.09345734, + 
"epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.08559105168392155, + "language_loss": 0.85866642, + "learning_rate": 0.0007859227332594901, + "loss": 0.86978048, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.17956543, + "step": 1700, + "time_per_iteration": 2.930842876434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106023, + "balance_loss_mlp": 1.0883081, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.09580530814462011, + "language_loss": 0.84299338, + "learning_rate": 0.0007856671005001365, + "loss": 0.85405362, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.17712402, + "step": 1701, + "time_per_iteration": 3.2081515789031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110257, + "balance_loss_mlp": 1.09185123, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.08565892816740808, + "language_loss": 0.81811458, + "learning_rate": 0.0007854113568349787, + "loss": 0.8292172, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.18408203, + "step": 1702, + "time_per_iteration": 3.1229259967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107177, + "balance_loss_mlp": 1.08861589, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.07794566968546403, + "language_loss": 0.80742395, + "learning_rate": 0.0007851555023633052, + "loss": 0.81849575, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.18554688, + "step": 1703, + "time_per_iteration": 2.87683367729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093514, + "balance_loss_mlp": 1.07504809, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.08579630919656539, + "language_loss": 0.82316363, + "learning_rate": 0.0007848995371844474, + "loss": 0.83409876, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.18469238, + "step": 1704, + 
"time_per_iteration": 2.543123483657837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108458, + "balance_loss_mlp": 1.09000456, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.08180134109500492, + "language_loss": 0.80497056, + "learning_rate": 0.0007846434613977801, + "loss": 0.81605512, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.18444824, + "step": 1705, + "time_per_iteration": 2.5694901943206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096099, + "balance_loss_mlp": 1.07726395, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.08642702147252447, + "language_loss": 0.7816267, + "learning_rate": 0.0007843872751027203, + "loss": 0.79258776, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.18835449, + "step": 1706, + "time_per_iteration": 2.8476855754852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091232, + "balance_loss_mlp": 1.07206345, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.07466271413415602, + "language_loss": 0.87096149, + "learning_rate": 0.0007841309783987287, + "loss": 0.88187379, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.19152832, + "step": 1707, + "time_per_iteration": 2.752048969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090341, + "balance_loss_mlp": 1.0709219, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.08448532304164387, + "language_loss": 0.8909331, + "learning_rate": 0.0007838745713853084, + "loss": 0.90183651, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.1940918, + "step": 1708, + "time_per_iteration": 2.576037883758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085126, + "balance_loss_mlp": 1.06595731, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.08173004229220915, + "language_loss": 0.84132832, + 
"learning_rate": 0.0007836180541620053, + "loss": 0.85217953, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.19152832, + "step": 1709, + "time_per_iteration": 2.7169644832611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084228, + "balance_loss_mlp": 1.06489253, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.09936539185168088, + "language_loss": 0.86458898, + "learning_rate": 0.0007833614268284082, + "loss": 0.8754313, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.19311523, + "step": 1710, + "time_per_iteration": 2.5532357692718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119417, + "balance_loss_mlp": 1.17919695, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.0502772245871811, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75303936, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.14941406, + "step": 1711, + "time_per_iteration": 4.93800163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084968, + "balance_loss_mlp": 1.06610942, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.0930127101012754, + "language_loss": 0.78468674, + "learning_rate": 0.0007828478422289016, + "loss": 0.7955364, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.18835449, + "step": 1712, + "time_per_iteration": 2.6106202602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094707, + "balance_loss_mlp": 1.0755266, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.07722441463790092, + "language_loss": 0.88823062, + "learning_rate": 0.0007825908851623833, + "loss": 0.89917773, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.19165039, + "step": 1713, + "time_per_iteration": 2.7708652019500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099793, + 
"balance_loss_mlp": 1.08030224, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.08538102567636462, + "language_loss": 0.84563339, + "learning_rate": 0.0007823338183843533, + "loss": 0.85663128, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.19482422, + "step": 1714, + "time_per_iteration": 2.6919374465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101813, + "balance_loss_mlp": 1.08302569, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.10472435712491576, + "language_loss": 0.80579829, + "learning_rate": 0.0007820766419946141, + "loss": 0.81681645, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.18762207, + "step": 1715, + "time_per_iteration": 3.3962650299072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133772, + "balance_loss_mlp": 1.12051618, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.022367363269540627, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80806249, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.1328125, + "step": 1716, + "time_per_iteration": 4.940594434738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117089, + "balance_loss_mlp": 1.0989933, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.07989503427724588, + "language_loss": 0.7557565, + "learning_rate": 0.0007815619607794288, + "loss": 0.76692742, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.1809082, + "step": 1717, + "time_per_iteration": 2.6619300842285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112079, + "balance_loss_mlp": 1.10175252, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.08732146715249756, + "language_loss": 0.82213569, + "learning_rate": 0.0007813044561538001, + "loss": 0.83334363, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 
0.19030762, + "step": 1718, + "time_per_iteration": 3.146427869796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118808, + "balance_loss_mlp": 1.0996747, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.07987567281751332, + "language_loss": 0.88114393, + "learning_rate": 0.0007810468423160958, + "loss": 0.89233208, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.19128418, + "step": 1719, + "time_per_iteration": 2.882783889770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116955, + "balance_loss_mlp": 1.09883487, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.07516231806962957, + "language_loss": 0.81837869, + "learning_rate": 0.0007807891193663306, + "loss": 0.82954824, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.18127441, + "step": 1720, + "time_per_iteration": 2.817091464996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115372, + "balance_loss_mlp": 1.09681106, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.08207921946386207, + "language_loss": 0.82360268, + "learning_rate": 0.0007805312874045614, + "loss": 0.83475637, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.18566895, + "step": 1721, + "time_per_iteration": 2.5788111686706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127428, + "balance_loss_mlp": 1.10856915, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.08587725731854692, + "language_loss": 0.86701787, + "learning_rate": 0.0007802733465308874, + "loss": 0.87829208, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.18847656, + "step": 1722, + "time_per_iteration": 2.47092866897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134903, + "balance_loss_mlp": 1.11681938, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.07875115394989439, + 
"language_loss": 0.84537411, + "learning_rate": 0.0007800152968454501, + "loss": 0.85672319, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.1809082, + "step": 1723, + "time_per_iteration": 2.689821481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134288, + "balance_loss_mlp": 1.1161443, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.07553816314554183, + "language_loss": 0.90259147, + "learning_rate": 0.0007797571384484334, + "loss": 0.91393435, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.18139648, + "step": 1724, + "time_per_iteration": 2.881140947341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130284, + "balance_loss_mlp": 1.11211705, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.09124178304656469, + "language_loss": 0.91919303, + "learning_rate": 0.0007794988714400633, + "loss": 0.93049586, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.18164062, + "step": 1725, + "time_per_iteration": 2.6405282020568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127051, + "balance_loss_mlp": 1.10823941, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.08426272849970545, + "language_loss": 0.85092092, + "learning_rate": 0.0007792404959206079, + "loss": 0.8621915, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.18798828, + "step": 1726, + "time_per_iteration": 2.5432610511779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127088, + "balance_loss_mlp": 1.1084559, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.07425680572728817, + "language_loss": 0.81119555, + "learning_rate": 0.0007789820119903774, + "loss": 0.82246637, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.1862793, + "step": 1727, + "time_per_iteration": 3.032222270965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01139545, + "balance_loss_mlp": 1.12562108, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.028014537923784853, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79632211, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.13964844, + "step": 1728, + "time_per_iteration": 4.8402745723724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136807, + "balance_loss_mlp": 1.11797178, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.0895292490434253, + "language_loss": 0.8341223, + "learning_rate": 0.0007784647192990428, + "loss": 0.84549034, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.18798828, + "step": 1729, + "time_per_iteration": 2.732290267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138805, + "balance_loss_mlp": 1.11925435, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.13711052560491443, + "language_loss": 0.80506217, + "learning_rate": 0.0007782059107387696, + "loss": 0.81645024, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.1953125, + "step": 1730, + "time_per_iteration": 2.8793182373046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114255, + "balance_loss_mlp": 1.12199879, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.08825875418673053, + "language_loss": 0.8822093, + "learning_rate": 0.0007779469941693826, + "loss": 0.8936348, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.20556641, + "step": 1731, + "time_per_iteration": 2.862053632736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136737, + "balance_loss_mlp": 1.11668622, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.0849632369239172, + "language_loss": 0.77099073, + "learning_rate": 0.0007776879696914029, + "loss": 0.78235817, + "num_input_tokens_seen": 143272976, + 
"router_z_loss_mlp": 0.20043945, + "step": 1732, + "time_per_iteration": 2.878997325897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137118, + "balance_loss_mlp": 1.11639929, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.06630968591745413, + "language_loss": 0.88863558, + "learning_rate": 0.000777428837405392, + "loss": 0.90000677, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.20715332, + "step": 1733, + "time_per_iteration": 2.849579095840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113514, + "balance_loss_mlp": 1.1140877, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.1678685499329745, + "language_loss": 0.86820018, + "learning_rate": 0.0007771695974119544, + "loss": 0.87955153, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.21069336, + "step": 1734, + "time_per_iteration": 2.5213568210601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011438, + "balance_loss_mlp": 1.12223458, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.07580918658919847, + "language_loss": 0.75353694, + "learning_rate": 0.0007769102498117359, + "loss": 0.76497495, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.21569824, + "step": 1735, + "time_per_iteration": 3.1764426231384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152046, + "balance_loss_mlp": 1.12946832, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.07940235688963863, + "language_loss": 0.79215956, + "learning_rate": 0.000776650794705424, + "loss": 0.80368006, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.22570801, + "step": 1736, + "time_per_iteration": 3.311570644378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150562, + "balance_loss_mlp": 1.12822187, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 
0.07154101803961593, + "language_loss": 0.82120311, + "learning_rate": 0.0007763912321937483, + "loss": 0.83270872, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.22351074, + "step": 1737, + "time_per_iteration": 2.7742059230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162448, + "balance_loss_mlp": 1.14046574, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.09893982821491046, + "language_loss": 0.82392818, + "learning_rate": 0.0007761315623774799, + "loss": 0.83555263, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.21972656, + "step": 1738, + "time_per_iteration": 3.4311513900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158796, + "balance_loss_mlp": 1.1368016, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.09029538875627986, + "language_loss": 0.87794083, + "learning_rate": 0.0007758717853574313, + "loss": 0.88952881, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.22009277, + "step": 1739, + "time_per_iteration": 2.771195411682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165102, + "balance_loss_mlp": 1.14437175, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.0906994231642372, + "language_loss": 0.89945674, + "learning_rate": 0.0007756119012344571, + "loss": 0.91110778, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.20739746, + "step": 1740, + "time_per_iteration": 2.60304594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150213, + "balance_loss_mlp": 1.12998307, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.09292231464176055, + "language_loss": 0.8424325, + "learning_rate": 0.0007753519101094535, + "loss": 0.85393465, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.20227051, + "step": 1741, + "time_per_iteration": 2.763831377029419 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01130901, + "balance_loss_mlp": 1.11101699, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.09107418087972757, + "language_loss": 0.86003816, + "learning_rate": 0.0007750918120833575, + "loss": 0.87134719, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.19873047, + "step": 1742, + "time_per_iteration": 2.5983192920684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110821, + "balance_loss_mlp": 1.08914852, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.08951756084527424, + "language_loss": 0.86919558, + "learning_rate": 0.0007748316072571485, + "loss": 0.88027763, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.19042969, + "step": 1743, + "time_per_iteration": 2.826857328414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096104, + "balance_loss_mlp": 1.07641089, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.07101368717418235, + "language_loss": 0.78953618, + "learning_rate": 0.0007745712957318467, + "loss": 0.80049723, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.19677734, + "step": 1744, + "time_per_iteration": 2.9848310947418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099368, + "balance_loss_mlp": 1.08075917, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06728871536655502, + "language_loss": 0.86402392, + "learning_rate": 0.0007743108776085141, + "loss": 0.87501758, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.18603516, + "step": 1745, + "time_per_iteration": 2.7903690338134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100918, + "balance_loss_mlp": 1.08167791, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.08105774730722601, + "language_loss": 0.83074069, + "learning_rate": 0.0007740503529882543, + "loss": 0.84174985, + "num_input_tokens_seen": 
144335104, + "router_z_loss_mlp": 0.19238281, + "step": 1746, + "time_per_iteration": 2.8164098262786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102514, + "balance_loss_mlp": 1.08327341, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.08939656691142209, + "language_loss": 0.90720791, + "learning_rate": 0.0007737897219722114, + "loss": 0.91823304, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.19226074, + "step": 1747, + "time_per_iteration": 2.682877540588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098067, + "balance_loss_mlp": 1.07800448, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.08976840313873562, + "language_loss": 0.81010032, + "learning_rate": 0.0007735289846615716, + "loss": 0.82108104, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.20068359, + "step": 1748, + "time_per_iteration": 2.687856674194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096768, + "balance_loss_mlp": 1.07715857, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.08605901070846078, + "language_loss": 0.81949353, + "learning_rate": 0.0007732681411575621, + "loss": 0.83046126, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.19616699, + "step": 1749, + "time_per_iteration": 2.711014747619629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100357, + "balance_loss_mlp": 1.08002043, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.0865041685268045, + "language_loss": 0.87347746, + "learning_rate": 0.0007730071915614514, + "loss": 0.88448107, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.20349121, + "step": 1750, + "time_per_iteration": 2.7877442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097656, + "balance_loss_mlp": 1.07754588, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 
0.099917727371098, + "language_loss": 0.88751096, + "learning_rate": 0.0007727461359745489, + "loss": 0.89848751, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.2010498, + "step": 1751, + "time_per_iteration": 2.5344979763031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110101, + "balance_loss_mlp": 1.09051538, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.06874041131201088, + "language_loss": 0.85970122, + "learning_rate": 0.0007724849744982056, + "loss": 0.87080222, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.19592285, + "step": 1752, + "time_per_iteration": 2.7278292179107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118351, + "balance_loss_mlp": 1.09820437, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.07532767444648983, + "language_loss": 0.81245279, + "learning_rate": 0.0007722237072338131, + "loss": 0.82363629, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.20141602, + "step": 1753, + "time_per_iteration": 2.715123414993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129336, + "balance_loss_mlp": 1.10946393, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.09907858659655516, + "language_loss": 0.85174322, + "learning_rate": 0.0007719623342828046, + "loss": 0.86303657, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.1986084, + "step": 1754, + "time_per_iteration": 2.580603837966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011433, + "balance_loss_mlp": 1.12336826, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.09468217220840029, + "language_loss": 0.84008503, + "learning_rate": 0.000771700855746654, + "loss": 0.85151798, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.19934082, + "step": 1755, + "time_per_iteration": 2.6360206604003906 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0115036, + "balance_loss_mlp": 1.13060665, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.06173278613548714, + "language_loss": 0.8813622, + "learning_rate": 0.0007714392717268763, + "loss": 0.89286578, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.19750977, + "step": 1756, + "time_per_iteration": 2.610471725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169557, + "balance_loss_mlp": 1.14999521, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.08560719953811556, + "language_loss": 0.86437309, + "learning_rate": 0.0007711775823250273, + "loss": 0.87606871, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.19555664, + "step": 1757, + "time_per_iteration": 2.5406768321990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179489, + "balance_loss_mlp": 1.16010547, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.06814979795763555, + "language_loss": 0.82866555, + "learning_rate": 0.0007709157876427039, + "loss": 0.84046042, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.19372559, + "step": 1758, + "time_per_iteration": 3.144188642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152293, + "balance_loss_mlp": 1.13320732, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.08381425857535812, + "language_loss": 0.85356963, + "learning_rate": 0.0007706538877815439, + "loss": 0.86509264, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.1907959, + "step": 1759, + "time_per_iteration": 2.6544251441955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145804, + "balance_loss_mlp": 1.12751722, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.07160952497477109, + "language_loss": 0.83250809, + "learning_rate": 0.0007703918828432259, + "loss": 0.84396613, + "num_input_tokens_seen": 
145323168, + "router_z_loss_mlp": 0.18273926, + "step": 1760, + "time_per_iteration": 2.639800548553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139561, + "balance_loss_mlp": 1.12061834, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.07528387784347967, + "language_loss": 0.89063478, + "learning_rate": 0.000770129772929469, + "loss": 0.90203035, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.18933105, + "step": 1761, + "time_per_iteration": 2.690807580947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143571, + "balance_loss_mlp": 1.12493849, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.07941213480930635, + "language_loss": 0.87791038, + "learning_rate": 0.0007698675581420334, + "loss": 0.88934612, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.18615723, + "step": 1762, + "time_per_iteration": 2.897935390472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135098, + "balance_loss_mlp": 1.11646509, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.08353352960784785, + "language_loss": 0.78453314, + "learning_rate": 0.0007696052385827199, + "loss": 0.79588407, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.18603516, + "step": 1763, + "time_per_iteration": 2.960893154144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144695, + "balance_loss_mlp": 1.12652755, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.0785374693184301, + "language_loss": 0.77934641, + "learning_rate": 0.00076934281435337, + "loss": 0.7907933, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.18188477, + "step": 1764, + "time_per_iteration": 2.8066813945770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131427, + "balance_loss_mlp": 1.11263931, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 
0.11428683327792583, + "language_loss": 0.86483157, + "learning_rate": 0.0007690802855558658, + "loss": 0.87614584, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.18762207, + "step": 1765, + "time_per_iteration": 2.9382381439208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097374, + "balance_loss_mlp": 1.08335495, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.038046821471630334, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77472329, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.140625, + "step": 1766, + "time_per_iteration": 4.939141750335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131665, + "balance_loss_mlp": 1.11155438, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.1972001158351392, + "language_loss": 0.89103919, + "learning_rate": 0.0007685549146641262, + "loss": 0.90235579, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.20117188, + "step": 1767, + "time_per_iteration": 2.596677780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113815, + "balance_loss_mlp": 1.11898088, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.0754052007703104, + "language_loss": 0.87994409, + "learning_rate": 0.0007682920727738579, + "loss": 0.89132559, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.19152832, + "step": 1768, + "time_per_iteration": 2.572606325149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011476, + "balance_loss_mlp": 1.12763298, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.09008834675764238, + "language_loss": 0.84476101, + "learning_rate": 0.000768029126723369, + "loss": 0.85623699, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.19958496, + "step": 1769, + "time_per_iteration": 2.517974615097046 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0117614, + "balance_loss_mlp": 1.15621972, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.08324416055939475, + "language_loss": 0.81926113, + "learning_rate": 0.0007677660766147447, + "loss": 0.83102256, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.19909668, + "step": 1770, + "time_per_iteration": 2.525979518890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113196, + "balance_loss_mlp": 1.0996542, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.058076344856887535, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73584139, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.13574219, + "step": 1771, + "time_per_iteration": 4.954227924346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208192, + "balance_loss_mlp": 1.18773556, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.12544773614524246, + "language_loss": 0.79168922, + "learning_rate": 0.0007672396646316306, + "loss": 0.80377114, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.20446777, + "step": 1772, + "time_per_iteration": 2.5573487281799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184404, + "balance_loss_mlp": 1.1633631, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.0812632702006711, + "language_loss": 0.80576169, + "learning_rate": 0.000766976302961512, + "loss": 0.81760573, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.21057129, + "step": 1773, + "time_per_iteration": 2.9981236457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174902, + "balance_loss_mlp": 1.15440965, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.08509012237921207, + "language_loss": 0.81078374, + "learning_rate": 0.0007667128376420003, + "loss": 0.82253277, + "num_input_tokens_seen": 
146617504, + "router_z_loss_mlp": 0.20495605, + "step": 1774, + "time_per_iteration": 2.6422817707061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141783, + "balance_loss_mlp": 1.12135017, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.07609688435085656, + "language_loss": 0.84329826, + "learning_rate": 0.0007664492687753817, + "loss": 0.85471606, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.2043457, + "step": 1775, + "time_per_iteration": 2.719444513320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133962, + "balance_loss_mlp": 1.11357749, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.0684007600896635, + "language_loss": 0.81250805, + "learning_rate": 0.000766185596463983, + "loss": 0.82384765, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.20397949, + "step": 1776, + "time_per_iteration": 2.641289472579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118215, + "balance_loss_mlp": 1.09844995, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.08848921826202948, + "language_loss": 0.76858222, + "learning_rate": 0.0007659218208101706, + "loss": 0.77976441, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.19750977, + "step": 1777, + "time_per_iteration": 3.121042490005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111411, + "balance_loss_mlp": 1.09507275, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.06446172596419028, + "language_loss": 0.84679043, + "learning_rate": 0.0007656579419163515, + "loss": 0.85793149, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.19018555, + "step": 1778, + "time_per_iteration": 2.8044042587280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115799, + "balance_loss_mlp": 1.09639132, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 
0.08419061749659096, + "language_loss": 0.7684586, + "learning_rate": 0.0007653939598849724, + "loss": 0.77961665, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.19396973, + "step": 1779, + "time_per_iteration": 2.5383636951446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090316, + "balance_loss_mlp": 1.07667828, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.04688573866990776, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83970523, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.13671875, + "step": 1780, + "time_per_iteration": 4.939146041870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100725, + "balance_loss_mlp": 1.0817349, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.09328427377426286, + "language_loss": 0.7993626, + "learning_rate": 0.000764865686819522, + "loss": 0.81036985, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.18969727, + "step": 1781, + "time_per_iteration": 3.0855140686035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097786, + "balance_loss_mlp": 1.07818818, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.0784117519331498, + "language_loss": 0.85829425, + "learning_rate": 0.0007646013959905449, + "loss": 0.86927211, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.19592285, + "step": 1782, + "time_per_iteration": 2.6008715629577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094184, + "balance_loss_mlp": 1.07484865, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.10020930760951015, + "language_loss": 0.80767882, + "learning_rate": 0.0007643370024341949, + "loss": 0.81862062, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.19311523, + "step": 1783, + "time_per_iteration": 3.1744794845581055 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01093606, + "balance_loss_mlp": 1.0741868, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.06177623901241128, + "language_loss": 0.82775044, + "learning_rate": 0.0007640725062531195, + "loss": 0.83868653, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.19396973, + "step": 1784, + "time_per_iteration": 2.5207273960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095739, + "balance_loss_mlp": 1.07624829, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.07609738057692413, + "language_loss": 0.86137176, + "learning_rate": 0.0007638079075500047, + "loss": 0.87232918, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.19482422, + "step": 1785, + "time_per_iteration": 2.6027305126190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_mlp": 1.02909327, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.02730093024075542, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76222348, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.12597656, + "step": 1786, + "time_per_iteration": 4.981709718704224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123604, + "balance_loss_mlp": 1.10412502, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.0828485615256838, + "language_loss": 0.82775986, + "learning_rate": 0.0007632784029886026, + "loss": 0.83899587, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.19470215, + "step": 1787, + "time_per_iteration": 2.6825647354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121547, + "balance_loss_mlp": 1.10167432, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.06541046205818803, + "language_loss": 0.84959292, + "learning_rate": 0.0007630134973358873, + "loss": 0.86080837, + 
"num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.1986084, + "step": 1788, + "time_per_iteration": 3.0164642333984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112959, + "balance_loss_mlp": 1.11006355, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.07128557935976318, + "language_loss": 0.86626679, + "learning_rate": 0.0007627484895722763, + "loss": 0.8775627, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.1953125, + "step": 1789, + "time_per_iteration": 2.7014718055725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134771, + "balance_loss_mlp": 1.11494648, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.08217230393347356, + "language_loss": 0.80139697, + "learning_rate": 0.0007624833798006552, + "loss": 0.81274474, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.19812012, + "step": 1790, + "time_per_iteration": 3.0889768600463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130993, + "balance_loss_mlp": 1.11054873, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.08452412416329605, + "language_loss": 0.83807981, + "learning_rate": 0.0007622181681239483, + "loss": 0.84938967, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.20446777, + "step": 1791, + "time_per_iteration": 2.668236017227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126724, + "balance_loss_mlp": 1.10656524, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.06876002435899166, + "language_loss": 0.84450197, + "learning_rate": 0.0007619528546451202, + "loss": 0.85576922, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.20153809, + "step": 1792, + "time_per_iteration": 2.820676326751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121181, + "balance_loss_mlp": 1.10096347, + "epoch": 0.3449403616775683, + "flos": 
967723863552.0, + "grad_norm": 0.0839228841992506, + "language_loss": 0.83888298, + "learning_rate": 0.0007616874394671745, + "loss": 0.8500948, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.20214844, + "step": 1793, + "time_per_iteration": 3.339189291000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121499, + "balance_loss_mlp": 1.10161519, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.08136840273622996, + "language_loss": 0.84983474, + "learning_rate": 0.0007614219226931547, + "loss": 0.86104971, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.19873047, + "step": 1794, + "time_per_iteration": 2.7227368354797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129707, + "balance_loss_mlp": 1.10958409, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.09590444489475901, + "language_loss": 0.84532511, + "learning_rate": 0.0007611563044261435, + "loss": 0.85662222, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.2010498, + "step": 1795, + "time_per_iteration": 2.546884536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125004, + "balance_loss_mlp": 1.10475039, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.0814281657370807, + "language_loss": 0.86456835, + "learning_rate": 0.0007608905847692631, + "loss": 0.87581837, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.20251465, + "step": 1796, + "time_per_iteration": 2.482780933380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116976, + "balance_loss_mlp": 1.0972116, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.08445523119956015, + "language_loss": 0.86433315, + "learning_rate": 0.0007606247638256749, + "loss": 0.87550294, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.19750977, + "step": 1797, + "time_per_iteration": 2.8908944129943848 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041988, + "balance_loss_mlp": 1.03016257, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.0206101242754925, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79212284, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.11816406, + "step": 1798, + "time_per_iteration": 4.959855079650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037793, + "balance_loss_mlp": 1.02591991, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.018708496865608985, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80365002, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.11865234, + "step": 1799, + "time_per_iteration": 4.7935545444488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129126, + "balance_loss_mlp": 1.10934877, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.08973397272803926, + "language_loss": 0.85623878, + "learning_rate": 0.0007598266943068686, + "loss": 0.86753011, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.19763184, + "step": 1800, + "time_per_iteration": 2.8019869327545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112277, + "balance_loss_mlp": 1.10252821, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.0674943248051881, + "language_loss": 0.83542264, + "learning_rate": 0.0007595604692488507, + "loss": 0.84665036, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.20239258, + "step": 1801, + "time_per_iteration": 2.6360082626342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126397, + "balance_loss_mlp": 1.10636973, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.06909713253641608, + "language_loss": 0.82839429, + "learning_rate": 0.0007592941434205215, + "loss": 
0.83965826, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.20031738, + "step": 1802, + "time_per_iteration": 2.8132333755493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015999, + "balance_loss_mlp": 1.0041256, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.010015114509230977, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74587059, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.11865234, + "step": 1803, + "time_per_iteration": 5.086339950561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104785, + "balance_loss_mlp": 1.08531845, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07426270940157376, + "language_loss": 0.80069757, + "learning_rate": 0.0007587611898665566, + "loss": 0.81174541, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.19458008, + "step": 1804, + "time_per_iteration": 3.092641592025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110039, + "balance_loss_mlp": 1.0910604, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.07581928055471668, + "language_loss": 0.81691384, + "learning_rate": 0.0007584945623478315, + "loss": 0.82801425, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.18969727, + "step": 1805, + "time_per_iteration": 2.846060037612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104021, + "balance_loss_mlp": 1.08541238, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.07473751481828116, + "language_loss": 0.80751228, + "learning_rate": 0.000758227834472617, + "loss": 0.81855249, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.18603516, + "step": 1806, + "time_per_iteration": 3.0771524906158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111397, + "balance_loss_mlp": 1.09499145, + "epoch": 0.3476337052712582, + 
"flos": 515654926848.0, + "grad_norm": 0.07117533522239076, + "language_loss": 0.77160984, + "learning_rate": 0.0007579610063444664, + "loss": 0.78274959, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.18969727, + "step": 1807, + "time_per_iteration": 2.765228509902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104162, + "balance_loss_mlp": 1.08548236, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.0766054024965894, + "language_loss": 0.8690778, + "learning_rate": 0.0007576940780669712, + "loss": 0.88011932, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.18664551, + "step": 1808, + "time_per_iteration": 3.279489278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123806, + "balance_loss_mlp": 1.10510182, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.07904928967380129, + "language_loss": 0.84151316, + "learning_rate": 0.0007574270497437624, + "loss": 0.85275126, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.18701172, + "step": 1809, + "time_per_iteration": 2.987900733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122089, + "balance_loss_mlp": 1.10336101, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.06962767524782593, + "language_loss": 0.87729847, + "learning_rate": 0.000757159921478509, + "loss": 0.88851929, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.18725586, + "step": 1810, + "time_per_iteration": 2.8426477909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055659, + "balance_loss_mlp": 1.04316616, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.023331363727236345, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75506294, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.125, + "step": 1811, + "time_per_iteration": 4.784373044967651 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146056, + "balance_loss_mlp": 1.12720931, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.0794635065049281, + "language_loss": 0.87678373, + "learning_rate": 0.0007566253655367423, + "loss": 0.88824427, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.18823242, + "step": 1812, + "time_per_iteration": 2.649627685546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151883, + "balance_loss_mlp": 1.13314283, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.08948054068367119, + "language_loss": 0.89612782, + "learning_rate": 0.000756357938067762, + "loss": 0.90764666, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.18737793, + "step": 1813, + "time_per_iteration": 2.6953165531158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151317, + "balance_loss_mlp": 1.13220787, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.08322597535257283, + "language_loss": 0.82610291, + "learning_rate": 0.0007560904110718033, + "loss": 0.83761609, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.19104004, + "step": 1814, + "time_per_iteration": 3.2898061275482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124529, + "balance_loss_mlp": 1.10556281, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.08612147208900138, + "language_loss": 0.8345058, + "learning_rate": 0.0007558227846527297, + "loss": 0.84575117, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.1895752, + "step": 1815, + "time_per_iteration": 2.9130759239196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123683, + "balance_loss_mlp": 1.10491991, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.09988459790630169, + "language_loss": 0.83118773, + "learning_rate": 0.0007555550589144429, + "loss": 
0.84242463, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.1875, + "step": 1816, + "time_per_iteration": 2.4752960205078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117728, + "balance_loss_mlp": 1.09804606, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.07751955343806295, + "language_loss": 0.84176993, + "learning_rate": 0.000755287233960883, + "loss": 0.85294718, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.19665527, + "step": 1817, + "time_per_iteration": 2.597585439682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098926, + "balance_loss_mlp": 1.07926798, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.08165217026076037, + "language_loss": 0.7746554, + "learning_rate": 0.0007550193098960292, + "loss": 0.78564465, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.19641113, + "step": 1818, + "time_per_iteration": 2.9257001876831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092408, + "balance_loss_mlp": 1.07195151, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.0691698669989475, + "language_loss": 0.85927546, + "learning_rate": 0.0007547512868238988, + "loss": 0.87019956, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.20446777, + "step": 1819, + "time_per_iteration": 3.1347925662994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108081, + "balance_loss_mlp": 1.06050837, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.09514158419007644, + "language_loss": 0.83275855, + "learning_rate": 0.0007544831648485473, + "loss": 0.84356666, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.20300293, + "step": 1820, + "time_per_iteration": 2.7215232849121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108343, + "balance_loss_mlp": 1.06210327, + "epoch": 0.35032704886494803, + 
"flos": 578752335360.0, + "grad_norm": 0.1073780855917388, + "language_loss": 0.81151676, + "learning_rate": 0.0007542149440740694, + "loss": 0.82235104, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.21350098, + "step": 1821, + "time_per_iteration": 2.6931724548339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080905, + "balance_loss_mlp": 1.05936432, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.1562262811893555, + "language_loss": 0.85392433, + "learning_rate": 0.000753946624604597, + "loss": 0.86473334, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.2154541, + "step": 1822, + "time_per_iteration": 2.7700464725494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072173, + "balance_loss_mlp": 1.05028629, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.08427952696401207, + "language_loss": 0.87906677, + "learning_rate": 0.0007536782065443015, + "loss": 0.88978851, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.21899414, + "step": 1823, + "time_per_iteration": 2.618863105773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084137, + "balance_loss_mlp": 1.06188059, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.15781529291863344, + "language_loss": 0.75435269, + "learning_rate": 0.0007534096899973919, + "loss": 0.76519406, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.22253418, + "step": 1824, + "time_per_iteration": 2.5891709327697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086563, + "balance_loss_mlp": 1.06396103, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.09040612359289192, + "language_loss": 0.82346433, + "learning_rate": 0.0007531410750681154, + "loss": 0.83432996, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.22595215, + "step": 1825, + "time_per_iteration": 2.810972213745117 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111588, + "balance_loss_mlp": 1.09455299, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.07292466952983544, + "language_loss": 0.86399037, + "learning_rate": 0.0007528723618607575, + "loss": 0.87514913, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.21325684, + "step": 1826, + "time_per_iteration": 3.474869966506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133984, + "balance_loss_mlp": 1.11370611, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.08837862995453269, + "language_loss": 0.82404733, + "learning_rate": 0.0007526035504796422, + "loss": 0.83538717, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.20275879, + "step": 1827, + "time_per_iteration": 2.8155739307403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150633, + "balance_loss_mlp": 1.13051069, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.10569988158542801, + "language_loss": 0.86735702, + "learning_rate": 0.0007523346410291312, + "loss": 0.87886333, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.20117188, + "step": 1828, + "time_per_iteration": 2.788748025894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147917, + "balance_loss_mlp": 1.12691236, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.11718864183712574, + "language_loss": 0.84880495, + "learning_rate": 0.0007520656336136245, + "loss": 0.86028415, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.21020508, + "step": 1829, + "time_per_iteration": 2.995258331298828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144006, + "balance_loss_mlp": 1.12407422, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.07752679685559628, + "language_loss": 0.87776285, + "learning_rate": 0.0007517965283375599, + "loss": 
0.88920295, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.19921875, + "step": 1830, + "time_per_iteration": 2.9131507873535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137113, + "balance_loss_mlp": 1.11694324, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.0712879308552529, + "language_loss": 0.89257503, + "learning_rate": 0.0007515273253054132, + "loss": 0.90394616, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.20166016, + "step": 1831, + "time_per_iteration": 2.7115964889526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144172, + "balance_loss_mlp": 1.12451458, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.08358912815272257, + "language_loss": 0.82353687, + "learning_rate": 0.0007512580246216988, + "loss": 0.83497858, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.19665527, + "step": 1832, + "time_per_iteration": 2.7660555839538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137261, + "balance_loss_mlp": 1.11740053, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.08932198209233742, + "language_loss": 0.84907162, + "learning_rate": 0.000750988626390968, + "loss": 0.86044419, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.19848633, + "step": 1833, + "time_per_iteration": 2.6142635345458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135258, + "balance_loss_mlp": 1.11577928, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.0712191508208571, + "language_loss": 0.84978765, + "learning_rate": 0.0007507191307178108, + "loss": 0.86114025, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.19470215, + "step": 1834, + "time_per_iteration": 2.8424935340881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124766, + "balance_loss_mlp": 1.10512066, + "epoch": 0.3530203924586379, + 
"flos": 551234792448.0, + "grad_norm": 0.12990441969076433, + "language_loss": 0.74422562, + "learning_rate": 0.0007504495377068543, + "loss": 0.75547332, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.19628906, + "step": 1835, + "time_per_iteration": 2.8079066276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129638, + "balance_loss_mlp": 1.11026645, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09183665723882013, + "language_loss": 0.81276792, + "learning_rate": 0.0007501798474627642, + "loss": 0.82406431, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.19360352, + "step": 1836, + "time_per_iteration": 2.952760934829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120111, + "balance_loss_mlp": 1.10109687, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.11181895830758388, + "language_loss": 0.83497429, + "learning_rate": 0.0007499100600902433, + "loss": 0.84617543, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.18994141, + "step": 1837, + "time_per_iteration": 3.0599989891052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112032, + "balance_loss_mlp": 1.09237409, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.08618493176537427, + "language_loss": 0.84243816, + "learning_rate": 0.0007496401756940324, + "loss": 0.85355854, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.1965332, + "step": 1838, + "time_per_iteration": 2.7366483211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111194, + "balance_loss_mlp": 1.09217548, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.1107744559232423, + "language_loss": 0.82783937, + "learning_rate": 0.0007493701943789098, + "loss": 0.8389588, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.19750977, + "step": 1839, + "time_per_iteration": 2.780212640762329 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107677, + "balance_loss_mlp": 1.08844888, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07955024359155173, + "language_loss": 0.82622725, + "learning_rate": 0.000749100116249692, + "loss": 0.83730406, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.19213867, + "step": 1840, + "time_per_iteration": 2.59558367729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110996, + "balance_loss_mlp": 1.09009957, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.09363875008830587, + "language_loss": 0.86041892, + "learning_rate": 0.0007488299414112321, + "loss": 0.87151849, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.19848633, + "step": 1841, + "time_per_iteration": 2.625204563140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112719, + "balance_loss_mlp": 1.0932045, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.07784236461393054, + "language_loss": 0.77495539, + "learning_rate": 0.0007485596699684215, + "loss": 0.78608257, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.19506836, + "step": 1842, + "time_per_iteration": 2.889179229736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110913, + "balance_loss_mlp": 1.0890193, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.0730470956511186, + "language_loss": 0.85287404, + "learning_rate": 0.000748289302026189, + "loss": 0.86396539, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.2010498, + "step": 1843, + "time_per_iteration": 2.8508758544921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117167, + "balance_loss_mlp": 1.09693718, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.08361202953284802, + "language_loss": 0.85558116, + "learning_rate": 0.0007480188376895004, + "loss": 
0.8667528, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.20227051, + "step": 1844, + "time_per_iteration": 3.0799713134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058665, + "balance_loss_mlp": 1.04655302, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.036648944322370085, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74870002, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.12109375, + "step": 1845, + "time_per_iteration": 4.911001205444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151869, + "balance_loss_mlp": 1.1320442, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08485938300722028, + "language_loss": 0.78214371, + "learning_rate": 0.0007474776202528074, + "loss": 0.79366243, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.19824219, + "step": 1846, + "time_per_iteration": 3.0216140747070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161099, + "balance_loss_mlp": 1.1411432, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08254469932015057, + "language_loss": 0.81304067, + "learning_rate": 0.000747206867362922, + "loss": 0.82465172, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.19946289, + "step": 1847, + "time_per_iteration": 3.090902090072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160243, + "balance_loss_mlp": 1.13996506, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.07042821685917994, + "language_loss": 0.83881712, + "learning_rate": 0.0007469360184988194, + "loss": 0.85041958, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.20275879, + "step": 1848, + "time_per_iteration": 2.834099292755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164664, + "balance_loss_mlp": 1.14419615, + "epoch": 0.3557137360523278, + 
"flos": 538564432896.0, + "grad_norm": 0.08278620993607219, + "language_loss": 0.86537004, + "learning_rate": 0.0007466650737656518, + "loss": 0.87701666, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.20471191, + "step": 1849, + "time_per_iteration": 2.6372272968292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164893, + "balance_loss_mlp": 1.14411473, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.1003606576453008, + "language_loss": 0.90052241, + "learning_rate": 0.0007463940332686098, + "loss": 0.9121713, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.20788574, + "step": 1850, + "time_per_iteration": 2.485778331756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138299, + "balance_loss_mlp": 1.11759257, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.07662996022318802, + "language_loss": 0.83963442, + "learning_rate": 0.0007461228971129205, + "loss": 0.85101742, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.20715332, + "step": 1851, + "time_per_iteration": 2.9709644317626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119617, + "balance_loss_mlp": 1.09905326, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.09722709387095821, + "language_loss": 0.8525731, + "learning_rate": 0.0007458516654038483, + "loss": 0.86376923, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.20568848, + "step": 1852, + "time_per_iteration": 2.678692579269409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122543, + "balance_loss_mlp": 1.10156226, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.11064851070237179, + "language_loss": 0.86565018, + "learning_rate": 0.0007455803382466946, + "loss": 0.87687564, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.20983887, + "step": 1853, + "time_per_iteration": 2.8357412815093994 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118205, + "balance_loss_mlp": 1.0977726, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.07486516106338226, + "language_loss": 0.87089902, + "learning_rate": 0.0007453089157467979, + "loss": 0.88208103, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.2043457, + "step": 1854, + "time_per_iteration": 2.808497667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110339, + "balance_loss_mlp": 1.08300531, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.0938349401282225, + "language_loss": 0.82008994, + "learning_rate": 0.0007450373980095341, + "loss": 0.83112389, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.20385742, + "step": 1855, + "time_per_iteration": 3.127755641937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102136, + "balance_loss_mlp": 1.08226347, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.07357008991516471, + "language_loss": 0.86741251, + "learning_rate": 0.0007447657851403155, + "loss": 0.87843382, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.1986084, + "step": 1856, + "time_per_iteration": 2.662548780441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104026, + "balance_loss_mlp": 1.08421302, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.09605793543255373, + "language_loss": 0.78325486, + "learning_rate": 0.0007444940772445915, + "loss": 0.79429507, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.19812012, + "step": 1857, + "time_per_iteration": 2.7455575466156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098079, + "balance_loss_mlp": 1.07937515, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.09380435326028273, + "language_loss": 0.80025625, + "learning_rate": 0.0007442222744278484, + "loss": 
0.81123704, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.18688965, + "step": 1858, + "time_per_iteration": 2.7159781455993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110587, + "balance_loss_mlp": 1.08752322, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.07197173632554923, + "language_loss": 0.8371805, + "learning_rate": 0.0007439503767956099, + "loss": 0.84823918, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.18347168, + "step": 1859, + "time_per_iteration": 2.7746405601501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129586, + "balance_loss_mlp": 1.11757004, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.053548748661834844, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80801189, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.12011719, + "step": 1860, + "time_per_iteration": 4.952972412109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141933, + "balance_loss_mlp": 1.12300301, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.07146029040980974, + "language_loss": 0.86061597, + "learning_rate": 0.000743406297506922, + "loss": 0.87203526, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.18920898, + "step": 1861, + "time_per_iteration": 2.788799285888672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155472, + "balance_loss_mlp": 1.13686371, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.08496046226468609, + "language_loss": 0.83806807, + "learning_rate": 0.0007431341160617031, + "loss": 0.84962279, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.18615723, + "step": 1862, + "time_per_iteration": 2.891972780227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153957, + "balance_loss_mlp": 1.13561106, + "epoch": 0.3584070796460177, + 
"flos": 507271403520.0, + "grad_norm": 0.08024798355603865, + "language_loss": 0.87945759, + "learning_rate": 0.0007428618402234491, + "loss": 0.89099711, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.18347168, + "step": 1863, + "time_per_iteration": 2.6548287868499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157244, + "balance_loss_mlp": 1.13868272, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.10629387801358743, + "language_loss": 0.79862851, + "learning_rate": 0.0007425894700978668, + "loss": 0.81020093, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.18579102, + "step": 1864, + "time_per_iteration": 2.80774188041687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153191, + "balance_loss_mlp": 1.13476086, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.07530240473897643, + "language_loss": 0.79704821, + "learning_rate": 0.0007423170057906996, + "loss": 0.80858016, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.1842041, + "step": 1865, + "time_per_iteration": 3.8680994510650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145718, + "balance_loss_mlp": 1.12701416, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.09184761749378255, + "language_loss": 0.86028153, + "learning_rate": 0.0007420444474077275, + "loss": 0.87173867, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.18688965, + "step": 1866, + "time_per_iteration": 2.5685620307922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113899, + "balance_loss_mlp": 1.12003553, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.09893409220555562, + "language_loss": 0.89461643, + "learning_rate": 0.0007417717950547671, + "loss": 0.90600634, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.18945312, + "step": 1867, + "time_per_iteration": 2.671124219894409 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107889, + "balance_loss_mlp": 1.06611049, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.038408778239575524, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77075499, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.12792969, + "step": 1868, + "time_per_iteration": 4.9185333251953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122572, + "balance_loss_mlp": 1.10416651, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.07553494616843248, + "language_loss": 0.84798276, + "learning_rate": 0.0007412262088623299, + "loss": 0.85920852, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.18408203, + "step": 1869, + "time_per_iteration": 2.7392468452453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.10186732, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.08536155576366684, + "language_loss": 0.79418659, + "learning_rate": 0.0007409532752346684, + "loss": 0.80538857, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.18334961, + "step": 1870, + "time_per_iteration": 2.696479082107544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119342, + "balance_loss_mlp": 1.10078073, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.06482127106924716, + "language_loss": 0.88322479, + "learning_rate": 0.0007406802480606491, + "loss": 0.89441818, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.18566895, + "step": 1871, + "time_per_iteration": 2.636009931564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125376, + "balance_loss_mlp": 1.1068871, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.08328980109467413, + "language_loss": 0.90382409, + "learning_rate": 0.0007404071274462707, + 
"loss": 0.91507781, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.18493652, + "step": 1872, + "time_per_iteration": 2.6033034324645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126247, + "balance_loss_mlp": 1.10767388, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.08507135616363887, + "language_loss": 0.83713084, + "learning_rate": 0.0007401339134975682, + "loss": 0.84839332, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.18579102, + "step": 1873, + "time_per_iteration": 2.6768579483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124337, + "balance_loss_mlp": 1.1061461, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.08710024588150622, + "language_loss": 0.8447001, + "learning_rate": 0.0007398606063206122, + "loss": 0.8559435, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.18200684, + "step": 1874, + "time_per_iteration": 2.6102805137634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118797, + "balance_loss_mlp": 1.1010226, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.09331990326127676, + "language_loss": 0.78271621, + "learning_rate": 0.0007395872060215101, + "loss": 0.79390419, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.17773438, + "step": 1875, + "time_per_iteration": 2.6235439777374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125165, + "balance_loss_mlp": 1.10746276, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.08705098996186143, + "language_loss": 0.8794744, + "learning_rate": 0.0007393137127064056, + "loss": 0.89072609, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.17724609, + "step": 1876, + "time_per_iteration": 2.693005323410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131992, + "balance_loss_mlp": 1.11434913, + "epoch": 
0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.07970542462566557, + "language_loss": 0.84223264, + "learning_rate": 0.0007390401264814779, + "loss": 0.85355258, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.17675781, + "step": 1877, + "time_per_iteration": 2.6267154216766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144153, + "balance_loss_mlp": 1.12600899, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.11052243492945069, + "language_loss": 0.84164327, + "learning_rate": 0.0007387664474529427, + "loss": 0.8530848, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.18151855, + "step": 1878, + "time_per_iteration": 2.6380414962768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114366, + "balance_loss_mlp": 1.12561202, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.06785614970382317, + "language_loss": 0.91167343, + "learning_rate": 0.0007384926757270518, + "loss": 0.92311001, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.18054199, + "step": 1879, + "time_per_iteration": 2.6760640144348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148828, + "balance_loss_mlp": 1.13057721, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.07379174248702317, + "language_loss": 0.79513329, + "learning_rate": 0.0007382188114100924, + "loss": 0.80662155, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.18249512, + "step": 1880, + "time_per_iteration": 2.980865716934204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140419, + "balance_loss_mlp": 1.12196517, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.08452869991753884, + "language_loss": 0.81477511, + "learning_rate": 0.0007379448546083884, + "loss": 0.82617927, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.18457031, + "step": 1881, + 
"time_per_iteration": 2.9168553352355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122618, + "balance_loss_mlp": 1.10411692, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.07446388495521607, + "language_loss": 0.87973779, + "learning_rate": 0.0007376708054282992, + "loss": 0.89096403, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.18481445, + "step": 1882, + "time_per_iteration": 2.987179756164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115299, + "balance_loss_mlp": 1.09675002, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.06334344400813875, + "language_loss": 0.83726645, + "learning_rate": 0.0007373966639762201, + "loss": 0.84841949, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.18530273, + "step": 1883, + "time_per_iteration": 2.611685276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107737, + "balance_loss_mlp": 1.08896196, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.34913247510054485, + "language_loss": 0.88361132, + "learning_rate": 0.0007371224303585822, + "loss": 0.89468867, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.18762207, + "step": 1884, + "time_per_iteration": 2.5775835514068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055354, + "balance_loss_mlp": 1.04219282, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.031056792089232132, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81412423, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.13183594, + "step": 1885, + "time_per_iteration": 4.700505256652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125656, + "balance_loss_mlp": 1.10721421, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.08679320645386224, + "language_loss": 0.82572937, + 
"learning_rate": 0.0007365736870525335, + "loss": 0.83698595, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.18457031, + "step": 1886, + "time_per_iteration": 2.859740734100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129292, + "balance_loss_mlp": 1.11139846, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.08795223769340633, + "language_loss": 0.82107997, + "learning_rate": 0.000736299177577164, + "loss": 0.8323729, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.17907715, + "step": 1887, + "time_per_iteration": 2.5841786861419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130014, + "balance_loss_mlp": 1.11198997, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.08315005772253937, + "language_loss": 0.83388066, + "learning_rate": 0.0007360245763623174, + "loss": 0.84518075, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.18029785, + "step": 1888, + "time_per_iteration": 2.665529489517212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.12729573, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.088670630002398, + "language_loss": 0.89456129, + "learning_rate": 0.0007357498835146039, + "loss": 0.90601313, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.17895508, + "step": 1889, + "time_per_iteration": 2.8847129344940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156911, + "balance_loss_mlp": 1.13911295, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.10357296063524607, + "language_loss": 0.87070376, + "learning_rate": 0.0007354750991406684, + "loss": 0.8822729, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.17810059, + "step": 1890, + "time_per_iteration": 2.723062753677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159624, + "balance_loss_mlp": 
1.14133692, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.08144896750451855, + "language_loss": 0.80397975, + "learning_rate": 0.0007352002233471919, + "loss": 0.81557596, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.18310547, + "step": 1891, + "time_per_iteration": 2.6574442386627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175761, + "balance_loss_mlp": 1.15818954, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.12092525276023756, + "language_loss": 0.79267627, + "learning_rate": 0.0007349252562408906, + "loss": 0.80443388, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.17590332, + "step": 1892, + "time_per_iteration": 2.7125816345214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180182, + "balance_loss_mlp": 1.16231263, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.10164191197483487, + "language_loss": 0.81473255, + "learning_rate": 0.0007346501979285158, + "loss": 0.82653439, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.17883301, + "step": 1893, + "time_per_iteration": 2.902371406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069378, + "balance_loss_mlp": 1.05621696, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.029928407037273664, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.8160848, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.13183594, + "step": 1894, + "time_per_iteration": 4.841979265213013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166903, + "balance_loss_mlp": 1.14858055, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.079124644393009, + "language_loss": 0.85946983, + "learning_rate": 0.0007340998081127308, + "loss": 0.87113881, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.18322754, + "step": 
1895, + "time_per_iteration": 2.7981679439544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149308, + "balance_loss_mlp": 1.13090205, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.08117131709807607, + "language_loss": 0.90645039, + "learning_rate": 0.0007338244768230007, + "loss": 0.91794348, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.18408203, + "step": 1896, + "time_per_iteration": 2.821958541870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131855, + "balance_loss_mlp": 1.11337733, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.06648659114179455, + "language_loss": 0.88624144, + "learning_rate": 0.0007335490547545578, + "loss": 0.89756, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.18469238, + "step": 1897, + "time_per_iteration": 3.0718753337860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115822, + "balance_loss_mlp": 1.09670115, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06783762736794967, + "language_loss": 0.82265627, + "learning_rate": 0.0007332735420143308, + "loss": 0.8338145, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.19091797, + "step": 1898, + "time_per_iteration": 2.7864439487457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103004, + "balance_loss_mlp": 1.08431149, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.10561663647405507, + "language_loss": 0.86410689, + "learning_rate": 0.0007329979387092826, + "loss": 0.87513697, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.18664551, + "step": 1899, + "time_per_iteration": 2.6032557487487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099553, + "balance_loss_mlp": 1.08087325, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.0619875823145499, + "language_loss": 0.83878422, + 
"learning_rate": 0.0007327222449464124, + "loss": 0.84977973, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.18676758, + "step": 1900, + "time_per_iteration": 3.2741036415100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103796, + "balance_loss_mlp": 1.08450782, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07856096432694096, + "language_loss": 0.885158, + "learning_rate": 0.0007324464608327538, + "loss": 0.89619601, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.19287109, + "step": 1901, + "time_per_iteration": 2.678788900375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094923, + "balance_loss_mlp": 1.07613552, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.117877128585243, + "language_loss": 0.88101745, + "learning_rate": 0.0007321705864753758, + "loss": 0.8919667, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.18774414, + "step": 1902, + "time_per_iteration": 2.746980905532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104989, + "balance_loss_mlp": 1.08645177, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.07495737234387592, + "language_loss": 0.83840346, + "learning_rate": 0.0007318946219813823, + "loss": 0.84945333, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.18530273, + "step": 1903, + "time_per_iteration": 3.0181055068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113516, + "balance_loss_mlp": 1.09416842, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.08147269799104237, + "language_loss": 0.89553183, + "learning_rate": 0.000731618567457912, + "loss": 0.90666699, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.19335938, + "step": 1904, + "time_per_iteration": 2.656008243560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112169, + "balance_loss_mlp": 
1.10242581, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.09666599698156476, + "language_loss": 0.86684108, + "learning_rate": 0.000731342423012139, + "loss": 0.87805796, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.19250488, + "step": 1905, + "time_per_iteration": 3.0675060749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130977, + "balance_loss_mlp": 1.11136723, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.07693711099894461, + "language_loss": 0.82752407, + "learning_rate": 0.0007310661887512722, + "loss": 0.83883387, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.19616699, + "step": 1906, + "time_per_iteration": 3.058940887451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121537, + "balance_loss_mlp": 1.10290504, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.08447106182036945, + "language_loss": 0.8153969, + "learning_rate": 0.0007307898647825549, + "loss": 0.82661223, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.1862793, + "step": 1907, + "time_per_iteration": 2.6844449043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123702, + "balance_loss_mlp": 1.10468769, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.09351646457276126, + "language_loss": 0.89255947, + "learning_rate": 0.0007305134512132659, + "loss": 0.90379649, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.18994141, + "step": 1908, + "time_per_iteration": 2.709672451019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110565, + "balance_loss_mlp": 1.09136009, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.10593037141853442, + "language_loss": 0.82889271, + "learning_rate": 0.0007302369481507183, + "loss": 0.83999836, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.19189453, + "step": 1909, 
+ "time_per_iteration": 2.521117687225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042583, + "balance_loss_mlp": 1.03214002, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.025696927495133286, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81004339, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10449219, + "step": 1910, + "time_per_iteration": 4.8944993019104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109603, + "balance_loss_mlp": 1.09143519, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.13197556892024634, + "language_loss": 0.85332, + "learning_rate": 0.000729683673975274, + "loss": 0.864416, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.18164062, + "step": 1911, + "time_per_iteration": 2.6855151653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113177, + "balance_loss_mlp": 1.09509254, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.05917682500902713, + "language_loss": 0.82910979, + "learning_rate": 0.0007294069030771774, + "loss": 0.84024155, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.1809082, + "step": 1912, + "time_per_iteration": 3.696908712387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119134, + "balance_loss_mlp": 1.10070467, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.2785371066278341, + "language_loss": 0.90901196, + "learning_rate": 0.0007291300431154224, + "loss": 0.92020327, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.18432617, + "step": 1913, + "time_per_iteration": 2.666469097137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066964, + "balance_loss_mlp": 1.05699825, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.035296075115353785, + "language_loss": 0.70389736, + 
"learning_rate": 0.0007288530941974955, + "loss": 0.71456701, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.09960938, + "step": 1914, + "time_per_iteration": 5.019417762756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176446, + "balance_loss_mlp": 1.1579566, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.09302167105112862, + "language_loss": 0.79388487, + "learning_rate": 0.0007285760564309179, + "loss": 0.80564928, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.18493652, + "step": 1915, + "time_per_iteration": 3.112898826599121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204501, + "balance_loss_mlp": 1.18492651, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.10352341742670183, + "language_loss": 0.84420514, + "learning_rate": 0.0007282989299232448, + "loss": 0.85625011, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.19567871, + "step": 1916, + "time_per_iteration": 3.0435094833374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222721, + "balance_loss_mlp": 1.20364785, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.07568711881104075, + "language_loss": 0.83658814, + "learning_rate": 0.0007280217147820668, + "loss": 0.84881544, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.19042969, + "step": 1917, + "time_per_iteration": 2.618802547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214339, + "balance_loss_mlp": 1.19502735, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06430089788192027, + "language_loss": 0.78882575, + "learning_rate": 0.0007277444111150079, + "loss": 0.80096912, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.19299316, + "step": 1918, + "time_per_iteration": 2.705514669418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212887, + 
"balance_loss_mlp": 1.19302678, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.1316988542142886, + "language_loss": 0.84107184, + "learning_rate": 0.0007274670190297272, + "loss": 0.85320067, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.19848633, + "step": 1919, + "time_per_iteration": 2.643360137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216839, + "balance_loss_mlp": 1.19697857, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.08424482176176182, + "language_loss": 0.82129955, + "learning_rate": 0.0007271895386339179, + "loss": 0.83346796, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.19848633, + "step": 1920, + "time_per_iteration": 2.7766342163085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209662, + "balance_loss_mlp": 1.1898967, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.08336147686301533, + "language_loss": 0.83142531, + "learning_rate": 0.0007269119700353073, + "loss": 0.84352195, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.19763184, + "step": 1921, + "time_per_iteration": 2.747455596923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217254, + "balance_loss_mlp": 1.19840705, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.06910916264284567, + "language_loss": 0.85129571, + "learning_rate": 0.0007266343133416571, + "loss": 0.86346817, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.18811035, + "step": 1922, + "time_per_iteration": 2.815875768661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107275, + "balance_loss_mlp": 1.09573579, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.04105564932095409, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78224194, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 
0.11523438, + "step": 1923, + "time_per_iteration": 4.86853289604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198125, + "balance_loss_mlp": 1.17899168, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.1110881339245658, + "language_loss": 0.84574348, + "learning_rate": 0.0007260787361004556, + "loss": 0.85772473, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.19128418, + "step": 1924, + "time_per_iteration": 2.580287456512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060779, + "balance_loss_mlp": 1.0494777, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.023148070033358246, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74822283, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11279297, + "step": 1925, + "time_per_iteration": 4.9416913986206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175905, + "balance_loss_mlp": 1.15692663, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.06834955035904498, + "language_loss": 0.87516356, + "learning_rate": 0.0007255228077730903, + "loss": 0.8869226, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.18969727, + "step": 1926, + "time_per_iteration": 2.7211105823516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176426, + "balance_loss_mlp": 1.15784156, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.06265343241116231, + "language_loss": 0.81563449, + "learning_rate": 0.0007252447122218632, + "loss": 0.82739878, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.18579102, + "step": 1927, + "time_per_iteration": 3.151231527328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172974, + "balance_loss_mlp": 1.15472341, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.09894828359622332, + 
"language_loss": 0.88063776, + "learning_rate": 0.0007249665292228834, + "loss": 0.89236754, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.18261719, + "step": 1928, + "time_per_iteration": 2.702021360397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173693, + "balance_loss_mlp": 1.1554302, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.08781668530165682, + "language_loss": 0.83526367, + "learning_rate": 0.000724688258884151, + "loss": 0.8470006, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.18249512, + "step": 1929, + "time_per_iteration": 2.560795783996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162512, + "balance_loss_mlp": 1.14461839, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.07372740974795068, + "language_loss": 0.86387187, + "learning_rate": 0.0007244099013137002, + "loss": 0.87549698, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.17907715, + "step": 1930, + "time_per_iteration": 3.090304374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153317, + "balance_loss_mlp": 1.1359247, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.07369885077257772, + "language_loss": 0.88680494, + "learning_rate": 0.0007241314566195993, + "loss": 0.89833808, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.17407227, + "step": 1931, + "time_per_iteration": 3.2688889503479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140069, + "balance_loss_mlp": 1.12190151, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.1370251830388882, + "language_loss": 0.85430074, + "learning_rate": 0.0007238529249099496, + "loss": 0.86570138, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.18164062, + "step": 1932, + "time_per_iteration": 2.6766042709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01056936, + "balance_loss_mlp": 1.04673159, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.03186229248255652, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78913808, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.10205078, + "step": 1933, + "time_per_iteration": 4.938454866409302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121071, + "balance_loss_mlp": 1.10291553, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.0858411932854742, + "language_loss": 0.80716681, + "learning_rate": 0.000723295600876581, + "loss": 0.81837749, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.1817627, + "step": 1934, + "time_per_iteration": 3.02756404876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127839, + "balance_loss_mlp": 1.10930252, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.07598025600252532, + "language_loss": 0.87578201, + "learning_rate": 0.0007230168087692344, + "loss": 0.8870604, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.18530273, + "step": 1935, + "time_per_iteration": 2.6842763423919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117422, + "balance_loss_mlp": 1.09867072, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.07638826910824403, + "language_loss": 0.82760978, + "learning_rate": 0.0007227379300790839, + "loss": 0.83878398, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.1875, + "step": 1936, + "time_per_iteration": 3.028691530227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126722, + "balance_loss_mlp": 1.10711217, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.1377793635442251, + "language_loss": 0.85613376, + "learning_rate": 0.0007224589649143997, + "loss": 0.86740094, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 
0.19604492, + "step": 1937, + "time_per_iteration": 2.5564050674438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129561, + "balance_loss_mlp": 1.11017799, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.07798966628460335, + "language_loss": 0.80875593, + "learning_rate": 0.0007221799133834861, + "loss": 0.82005155, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.19360352, + "step": 1938, + "time_per_iteration": 2.6535797119140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128571, + "balance_loss_mlp": 1.10997486, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.20771096851505863, + "language_loss": 0.81190193, + "learning_rate": 0.00072190077559468, + "loss": 0.82318759, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.18591309, + "step": 1939, + "time_per_iteration": 2.5281853675842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119884, + "balance_loss_mlp": 1.10124016, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.07206730115622964, + "language_loss": 0.89147639, + "learning_rate": 0.0007216215516563527, + "loss": 0.90267527, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.18640137, + "step": 1940, + "time_per_iteration": 2.7357096672058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112047, + "balance_loss_mlp": 1.1024456, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.09123969930056855, + "language_loss": 0.839782, + "learning_rate": 0.0007213422416769083, + "loss": 0.8509866, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.18029785, + "step": 1941, + "time_per_iteration": 2.6104605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119536, + "balance_loss_mlp": 1.10109389, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.07207094919122449, + 
"language_loss": 0.75049472, + "learning_rate": 0.0007210628457647849, + "loss": 0.76169002, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.18444824, + "step": 1942, + "time_per_iteration": 2.5805821418762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129118, + "balance_loss_mlp": 1.11117733, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.10610035509825085, + "language_loss": 0.78376162, + "learning_rate": 0.000720783364028453, + "loss": 0.79505277, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.17956543, + "step": 1943, + "time_per_iteration": 2.780245542526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140529, + "balance_loss_mlp": 1.1218369, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.07224730964326329, + "language_loss": 0.87268645, + "learning_rate": 0.0007205037965764177, + "loss": 0.88409173, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.18688965, + "step": 1944, + "time_per_iteration": 2.5735671520233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151311, + "balance_loss_mlp": 1.13291705, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07659834869138271, + "language_loss": 0.8526088, + "learning_rate": 0.0007202241435172161, + "loss": 0.86412191, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.18408203, + "step": 1945, + "time_per_iteration": 2.7935566902160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126679, + "balance_loss_mlp": 1.10871434, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.3794268789868596, + "language_loss": 0.88413203, + "learning_rate": 0.0007199444049594198, + "loss": 0.89539886, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.17956543, + "step": 1946, + "time_per_iteration": 2.995715379714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01127316, + "balance_loss_mlp": 1.10844493, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.0746444377907342, + "language_loss": 0.83035469, + "learning_rate": 0.0007196645810116322, + "loss": 0.8416279, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.1887207, + "step": 1947, + "time_per_iteration": 2.766355037689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142049, + "balance_loss_mlp": 1.12292802, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.07850495494132069, + "language_loss": 0.83822554, + "learning_rate": 0.0007193846717824912, + "loss": 0.84964609, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.19104004, + "step": 1948, + "time_per_iteration": 2.925459623336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133743, + "balance_loss_mlp": 1.11488414, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.08022476151722048, + "language_loss": 0.88327885, + "learning_rate": 0.0007191046773806669, + "loss": 0.89461625, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.18859863, + "step": 1949, + "time_per_iteration": 2.5894553661346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123414, + "balance_loss_mlp": 1.10373282, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.08918312945621011, + "language_loss": 0.83225584, + "learning_rate": 0.0007188245979148631, + "loss": 0.84349, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.19665527, + "step": 1950, + "time_per_iteration": 3.159851551055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126856, + "balance_loss_mlp": 1.1067214, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.11158799296642749, + "language_loss": 0.87878865, + "learning_rate": 0.0007185444334938157, + "loss": 0.89005721, + "num_input_tokens_seen": 162124384, + 
"router_z_loss_mlp": 0.20129395, + "step": 1951, + "time_per_iteration": 2.7033133506774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111192, + "balance_loss_mlp": 1.09180903, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.09975748916923241, + "language_loss": 0.8500011, + "learning_rate": 0.0007182641842262947, + "loss": 0.86111307, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.19372559, + "step": 1952, + "time_per_iteration": 2.626728057861328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108209, + "balance_loss_mlp": 1.08878958, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.09334076595597436, + "language_loss": 0.77694595, + "learning_rate": 0.0007179838502211022, + "loss": 0.78802806, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.19421387, + "step": 1953, + "time_per_iteration": 2.8748068809509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106763, + "balance_loss_mlp": 1.08678353, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.0737363931585354, + "language_loss": 0.86213845, + "learning_rate": 0.0007177034315870738, + "loss": 0.87320614, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.19970703, + "step": 1954, + "time_per_iteration": 2.961113929748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110959, + "balance_loss_mlp": 1.08933675, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.08944632819393537, + "language_loss": 0.91041321, + "learning_rate": 0.0007174229284330773, + "loss": 0.92150909, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.20239258, + "step": 1955, + "time_per_iteration": 2.6537580490112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113343, + "balance_loss_mlp": 1.09273195, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 
0.10287168416480917, + "language_loss": 0.86629105, + "learning_rate": 0.0007171423408680141, + "loss": 0.87742448, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.20605469, + "step": 1956, + "time_per_iteration": 2.814793348312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106345, + "balance_loss_mlp": 1.08584106, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.10543893351617999, + "language_loss": 0.89721847, + "learning_rate": 0.0007168616690008176, + "loss": 0.90828192, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.20495605, + "step": 1957, + "time_per_iteration": 2.6851284503936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098402, + "balance_loss_mlp": 1.07823181, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.08262297472790796, + "language_loss": 0.85860795, + "learning_rate": 0.0007165809129404545, + "loss": 0.86959195, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.20166016, + "step": 1958, + "time_per_iteration": 2.756485939025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106731, + "balance_loss_mlp": 1.08695424, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.08262792958454514, + "language_loss": 0.85935986, + "learning_rate": 0.0007163000727959239, + "loss": 0.87042725, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.19775391, + "step": 1959, + "time_per_iteration": 2.525435447692871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070977, + "balance_loss_mlp": 1.06053388, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.03547956764144784, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79030049, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.10449219, + "step": 1960, + "time_per_iteration": 4.89080286026001 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01149436, + "balance_loss_mlp": 1.13035011, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.06578569091259368, + "language_loss": 0.84412438, + "learning_rate": 0.00071573814069052, + "loss": 0.85561872, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.19067383, + "step": 1961, + "time_per_iteration": 2.9070186614990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173736, + "balance_loss_mlp": 1.15444791, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.18582927476215966, + "language_loss": 0.87659955, + "learning_rate": 0.0007154570489478081, + "loss": 0.8883369, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.19274902, + "step": 1962, + "time_per_iteration": 3.2049379348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173644, + "balance_loss_mlp": 1.15447557, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.14724331795419812, + "language_loss": 0.86293024, + "learning_rate": 0.0007151758735572514, + "loss": 0.87466669, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.19152832, + "step": 1963, + "time_per_iteration": 3.0349316596984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142067, + "balance_loss_mlp": 1.12338686, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.0939989250476118, + "language_loss": 0.80074733, + "learning_rate": 0.0007148946146280119, + "loss": 0.812168, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.18676758, + "step": 1964, + "time_per_iteration": 2.8144431114196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048428, + "balance_loss_mlp": 1.03836632, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.021748901232604565, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73240578, + "num_input_tokens_seen": 
163455760, + "router_z_loss_mlp": 0.10058594, + "step": 1965, + "time_per_iteration": 4.930070400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055709, + "balance_loss_mlp": 1.04559994, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.023739163757957975, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76397657, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.10107422, + "step": 1966, + "time_per_iteration": 4.934873580932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137343, + "balance_loss_mlp": 1.11776876, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.08213272343580422, + "language_loss": 0.83802509, + "learning_rate": 0.0007140503377003022, + "loss": 0.84939849, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.19555664, + "step": 1967, + "time_per_iteration": 3.0881879329681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139491, + "balance_loss_mlp": 1.11967874, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.1174729362064234, + "language_loss": 0.84845448, + "learning_rate": 0.000713768745708599, + "loss": 0.85984945, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.19799805, + "step": 1968, + "time_per_iteration": 2.635103225708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150253, + "balance_loss_mlp": 1.12999952, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.12024050748438767, + "language_loss": 0.77237123, + "learning_rate": 0.0007134870707245085, + "loss": 0.7838738, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.20251465, + "step": 1969, + "time_per_iteration": 3.2765696048736572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137246, + "balance_loss_mlp": 1.11786246, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + 
"grad_norm": 0.12719814054785675, + "language_loss": 0.84604537, + "learning_rate": 0.0007132053128573864, + "loss": 0.85741782, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.19372559, + "step": 1970, + "time_per_iteration": 2.741464614868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134845, + "balance_loss_mlp": 1.11534226, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.07594331821705162, + "language_loss": 0.83660662, + "learning_rate": 0.0007129234722166211, + "loss": 0.84795505, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.19482422, + "step": 1971, + "time_per_iteration": 2.879617214202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150531, + "balance_loss_mlp": 1.13185048, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.10702357186833415, + "language_loss": 0.90689349, + "learning_rate": 0.0007126415489116328, + "loss": 0.91839886, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.18676758, + "step": 1972, + "time_per_iteration": 2.7060065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177798, + "balance_loss_mlp": 1.15965438, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.08068810601979462, + "language_loss": 0.81252205, + "learning_rate": 0.0007123595430518736, + "loss": 0.82429999, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.18151855, + "step": 1973, + "time_per_iteration": 2.872903823852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217278, + "balance_loss_mlp": 1.19866943, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.10171747912447733, + "language_loss": 0.86328602, + "learning_rate": 0.0007120774547468282, + "loss": 0.87545884, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.18591309, + "step": 1974, + "time_per_iteration": 2.5397889614105225 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240679, + "balance_loss_mlp": 1.22244012, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.14549097169765346, + "language_loss": 0.81380564, + "learning_rate": 0.0007117952841060128, + "loss": 0.82621247, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.18249512, + "step": 1975, + "time_per_iteration": 2.6751859188079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203512, + "balance_loss_mlp": 1.18491578, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.08096849874764685, + "language_loss": 0.8358916, + "learning_rate": 0.0007115130312389756, + "loss": 0.84792668, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.18579102, + "step": 1976, + "time_per_iteration": 2.6997742652893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194849, + "balance_loss_mlp": 1.17584705, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.0836403104795401, + "language_loss": 0.78931224, + "learning_rate": 0.0007112306962552973, + "loss": 0.80126077, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.18994141, + "step": 1977, + "time_per_iteration": 2.6066653728485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177391, + "balance_loss_mlp": 1.15869951, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.0835848576107689, + "language_loss": 0.84830624, + "learning_rate": 0.0007109482792645896, + "loss": 0.86008012, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.18676758, + "step": 1978, + "time_per_iteration": 2.7217793464660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163855, + "balance_loss_mlp": 1.14444792, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.18446881037378643, + "language_loss": 0.83627468, + "learning_rate": 0.0007106657803764969, + "loss": 0.84791327, 
+ "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.19384766, + "step": 1979, + "time_per_iteration": 2.7421200275421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142527, + "balance_loss_mlp": 1.12388265, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.07567906441681438, + "language_loss": 0.81599772, + "learning_rate": 0.0007103831997006948, + "loss": 0.82742298, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.18652344, + "step": 1980, + "time_per_iteration": 2.7659311294555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137326, + "balance_loss_mlp": 1.11770415, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.10880870313335556, + "language_loss": 0.85352248, + "learning_rate": 0.0007101005373468908, + "loss": 0.86489582, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.19628906, + "step": 1981, + "time_per_iteration": 2.8786306381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130638, + "balance_loss_mlp": 1.11189866, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.09193767407328653, + "language_loss": 0.86793411, + "learning_rate": 0.0007098177934248242, + "loss": 0.87924051, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.18737793, + "step": 1982, + "time_per_iteration": 2.7491414546966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112619, + "balance_loss_mlp": 1.10644913, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.08063581171786138, + "language_loss": 0.85497284, + "learning_rate": 0.0007095349680442661, + "loss": 0.86623472, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.1973877, + "step": 1983, + "time_per_iteration": 2.8513927459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123414, + "balance_loss_mlp": 1.10408998, + "epoch": 0.3816852635629088, + "flos": 
570690012672.0, + "grad_norm": 0.1315455004610476, + "language_loss": 0.79132575, + "learning_rate": 0.0007092520613150188, + "loss": 0.80255985, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.19299316, + "step": 1984, + "time_per_iteration": 2.7137770652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122797, + "balance_loss_mlp": 1.1034615, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.07682315674204161, + "language_loss": 0.81457669, + "learning_rate": 0.0007089690733469165, + "loss": 0.82580465, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.1932373, + "step": 1985, + "time_per_iteration": 2.7019522190093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153334, + "balance_loss_mlp": 1.13452315, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.10399563311309594, + "language_loss": 0.82318014, + "learning_rate": 0.000708686004249825, + "loss": 0.83471346, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.18811035, + "step": 1986, + "time_per_iteration": 2.797624111175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115288, + "balance_loss_mlp": 1.13355637, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.07772659738204864, + "language_loss": 0.91482198, + "learning_rate": 0.0007084028541336413, + "loss": 0.92635083, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.19299316, + "step": 1987, + "time_per_iteration": 2.7236177921295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159354, + "balance_loss_mlp": 1.13969636, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.13308271196687566, + "language_loss": 0.86052763, + "learning_rate": 0.0007081196231082942, + "loss": 0.87212121, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.19641113, + "step": 1988, + "time_per_iteration": 2.837611198425293 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141061, + "balance_loss_mlp": 1.12171304, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.1253750556073725, + "language_loss": 0.79903424, + "learning_rate": 0.0007078363112837436, + "loss": 0.81044483, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.19335938, + "step": 1989, + "time_per_iteration": 2.8450546264648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135085, + "balance_loss_mlp": 1.11594021, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.06314586189395412, + "language_loss": 0.8480984, + "learning_rate": 0.000707552918769981, + "loss": 0.85944927, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.19128418, + "step": 1990, + "time_per_iteration": 2.5055301189422607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117815, + "balance_loss_mlp": 1.09837222, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.09018786790446763, + "language_loss": 0.8355186, + "learning_rate": 0.000707269445677029, + "loss": 0.84669679, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.19433594, + "step": 1991, + "time_per_iteration": 2.790247917175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120171, + "balance_loss_mlp": 1.10065699, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.07803627169317769, + "language_loss": 0.8551231, + "learning_rate": 0.0007069858921149416, + "loss": 0.86632484, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.19494629, + "step": 1992, + "time_per_iteration": 2.9850950241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128282, + "balance_loss_mlp": 1.10929155, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.08439673282063015, + "language_loss": 0.86369681, + "learning_rate": 0.0007067022581938043, + "loss": 0.87497962, + 
"num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.18981934, + "step": 1993, + "time_per_iteration": 2.838817834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120054, + "balance_loss_mlp": 1.10115981, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.10464401531680585, + "language_loss": 0.83076423, + "learning_rate": 0.0007064185440237334, + "loss": 0.84196478, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.18884277, + "step": 1994, + "time_per_iteration": 2.7403006553649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113897, + "balance_loss_mlp": 1.09485924, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.07520001194530918, + "language_loss": 0.8432954, + "learning_rate": 0.0007061347497148764, + "loss": 0.85443437, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.19018555, + "step": 1995, + "time_per_iteration": 2.797116994857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117711, + "balance_loss_mlp": 1.0988524, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.10442861201560887, + "language_loss": 0.86312652, + "learning_rate": 0.0007058508753774122, + "loss": 0.87430364, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.18847656, + "step": 1996, + "time_per_iteration": 2.708909511566162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111759, + "balance_loss_mlp": 1.098791, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.07371207674818485, + "language_loss": 0.86599022, + "learning_rate": 0.0007055669211215505, + "loss": 0.87716615, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.18786621, + "step": 1997, + "time_per_iteration": 2.639425277709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129297, + "balance_loss_mlp": 1.11073565, + "epoch": 0.3843786071565987, + "flos": 
572940988416.0, + "grad_norm": 0.10349237512498541, + "language_loss": 0.77684987, + "learning_rate": 0.0007052828870575322, + "loss": 0.7881428, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.1854248, + "step": 1998, + "time_per_iteration": 2.6582653522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141105, + "balance_loss_mlp": 1.12290192, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.06112561257491971, + "language_loss": 0.8669157, + "learning_rate": 0.0007049987732956291, + "loss": 0.87832677, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.18212891, + "step": 1999, + "time_per_iteration": 2.9868295192718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130964, + "balance_loss_mlp": 1.11211705, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.05929570453342199, + "language_loss": 0.82587528, + "learning_rate": 0.0007047145799461439, + "loss": 0.83718491, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.18835449, + "step": 2000, + "time_per_iteration": 2.8687593936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136368, + "balance_loss_mlp": 1.11759257, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.08059531994541343, + "language_loss": 0.82050723, + "learning_rate": 0.00070443030711941, + "loss": 0.83187091, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.18762207, + "step": 2001, + "time_per_iteration": 2.7824347019195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113557, + "balance_loss_mlp": 1.11636579, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.09146293400396303, + "language_loss": 0.8213051, + "learning_rate": 0.0007041459549257924, + "loss": 0.83266079, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.19189453, + "step": 2002, + "time_per_iteration": 2.8634302616119385 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137137, + "balance_loss_mlp": 1.11758697, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.08512403296601297, + "language_loss": 0.78107333, + "learning_rate": 0.0007038615234756859, + "loss": 0.79244471, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.1953125, + "step": 2003, + "time_per_iteration": 3.2058236598968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136926, + "balance_loss_mlp": 1.11745918, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.07973278859066837, + "language_loss": 0.840294, + "learning_rate": 0.000703577012879517, + "loss": 0.85166335, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.19458008, + "step": 2004, + "time_per_iteration": 2.7286102771759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144109, + "balance_loss_mlp": 1.12510681, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.07975228006523119, + "language_loss": 0.88714588, + "learning_rate": 0.0007032924232477423, + "loss": 0.89858699, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.19006348, + "step": 2005, + "time_per_iteration": 2.6980981826782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136738, + "balance_loss_mlp": 1.11721206, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.08525396844891328, + "language_loss": 0.8036226, + "learning_rate": 0.0007030077546908493, + "loss": 0.81499004, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.19506836, + "step": 2006, + "time_per_iteration": 2.6433420181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225281, + "balance_loss_mlp": 1.21288347, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.07049383229006134, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 
0.84289944, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.12402344, + "step": 2007, + "time_per_iteration": 4.82226037979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113221, + "balance_loss_mlp": 1.11288631, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.07446306607004384, + "language_loss": 0.78622216, + "learning_rate": 0.0007024381812438117, + "loss": 0.7975443, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.19299316, + "step": 2008, + "time_per_iteration": 2.52738618850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128683, + "balance_loss_mlp": 1.10928798, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.09860455371344472, + "language_loss": 0.82941681, + "learning_rate": 0.0007021532765747951, + "loss": 0.84070361, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.19396973, + "step": 2009, + "time_per_iteration": 3.007847309112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135681, + "balance_loss_mlp": 1.115821, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.08526755269117656, + "language_loss": 0.79078948, + "learning_rate": 0.0007018682934229162, + "loss": 0.80214632, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.1986084, + "step": 2010, + "time_per_iteration": 2.9435882568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122545, + "balance_loss_mlp": 1.10262537, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.06758132101189684, + "language_loss": 0.82111001, + "learning_rate": 0.0007015832318988152, + "loss": 0.83233541, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.19909668, + "step": 2011, + "time_per_iteration": 2.6552624702453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043733, + "balance_loss_mlp": 1.03133512, + "epoch": 0.38707195075028855, + 
"flos": 1527771663360.0, + "grad_norm": 0.01882295684379882, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74933803, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.12402344, + "step": 2012, + "time_per_iteration": 5.011860609054565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111441, + "balance_loss_mlp": 1.09159219, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.07301389252885741, + "language_loss": 0.84162498, + "learning_rate": 0.0007010128741766604, + "loss": 0.85273933, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.19836426, + "step": 2013, + "time_per_iteration": 2.766516923904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111771, + "balance_loss_mlp": 1.09080195, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.10834212581561939, + "language_loss": 0.84428859, + "learning_rate": 0.0007007275782000391, + "loss": 0.85540634, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.20983887, + "step": 2014, + "time_per_iteration": 2.6184933185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108065, + "balance_loss_mlp": 1.08796668, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.07735715793711462, + "language_loss": 0.8448838, + "learning_rate": 0.0007004422042940605, + "loss": 0.85596442, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.20092773, + "step": 2015, + "time_per_iteration": 2.5543320178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109418, + "balance_loss_mlp": 1.08941483, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.08270873816767256, + "language_loss": 0.89443475, + "learning_rate": 0.0007001567525695169, + "loss": 0.9055289, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.19995117, + "step": 2016, + "time_per_iteration": 
2.6072936058044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106093, + "balance_loss_mlp": 1.08593512, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.06162053071135558, + "language_loss": 0.83763885, + "learning_rate": 0.0006998712231372303, + "loss": 0.84869981, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.20166016, + "step": 2017, + "time_per_iteration": 3.0785679817199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110727, + "balance_loss_mlp": 1.08730268, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06865572989075389, + "language_loss": 0.86015558, + "learning_rate": 0.0006995856161080532, + "loss": 0.87122822, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.19958496, + "step": 2018, + "time_per_iteration": 2.8914577960968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112506, + "balance_loss_mlp": 1.09202576, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.07931380391873609, + "language_loss": 0.82694459, + "learning_rate": 0.0006992999315928679, + "loss": 0.83806968, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.20483398, + "step": 2019, + "time_per_iteration": 2.7892749309539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110031, + "balance_loss_mlp": 1.08994412, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.08754557392654386, + "language_loss": 0.85419971, + "learning_rate": 0.0006990141697025871, + "loss": 0.8653, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.20080566, + "step": 2020, + "time_per_iteration": 2.7910003662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038615, + "balance_loss_mlp": 1.02712286, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.02439767662091094, + "language_loss": 0.76359642, + "learning_rate": 
0.0006987283305481533, + "loss": 0.77398252, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.11474609, + "step": 2021, + "time_per_iteration": 4.809415340423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125614, + "balance_loss_mlp": 1.10596848, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0885537285439357, + "language_loss": 0.82239556, + "learning_rate": 0.0006984424142405392, + "loss": 0.83365172, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.19641113, + "step": 2022, + "time_per_iteration": 2.8510379791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124515, + "balance_loss_mlp": 1.10540605, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.08944143564846467, + "language_loss": 0.82328045, + "learning_rate": 0.0006981564208907474, + "loss": 0.83452559, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.19091797, + "step": 2023, + "time_per_iteration": 2.6450161933898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125021, + "balance_loss_mlp": 1.10580468, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.06744861114448035, + "language_loss": 0.89889395, + "learning_rate": 0.0006978703506098102, + "loss": 0.91014421, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.19189453, + "step": 2024, + "time_per_iteration": 2.845273494720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142716, + "balance_loss_mlp": 1.12338066, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.22805579315722818, + "language_loss": 0.87903351, + "learning_rate": 0.00069758420350879, + "loss": 0.89046067, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.1932373, + "step": 2025, + "time_per_iteration": 2.673590898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147111, + "balance_loss_mlp": 1.12706041, + 
"epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.08766781252639666, + "language_loss": 0.85837841, + "learning_rate": 0.000697297979698779, + "loss": 0.86984944, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.20043945, + "step": 2026, + "time_per_iteration": 2.7639670372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146919, + "balance_loss_mlp": 1.12766671, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06921765861152807, + "language_loss": 0.83379734, + "learning_rate": 0.0006970116792908992, + "loss": 0.84526652, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.19226074, + "step": 2027, + "time_per_iteration": 3.1537575721740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165828, + "balance_loss_mlp": 1.14574075, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.10608539967442848, + "language_loss": 0.81162727, + "learning_rate": 0.000696725302396302, + "loss": 0.82328546, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.20080566, + "step": 2028, + "time_per_iteration": 2.713486671447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169814, + "balance_loss_mlp": 1.14985871, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.08953149679914804, + "language_loss": 0.85771465, + "learning_rate": 0.0006964388491261692, + "loss": 0.86941278, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.19946289, + "step": 2029, + "time_per_iteration": 3.2685461044311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117313, + "balance_loss_mlp": 1.15280437, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.07138064393758646, + "language_loss": 0.87465048, + "learning_rate": 0.0006961523195917114, + "loss": 0.88638175, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.20324707, + "step": 2030, + 
"time_per_iteration": 2.8363735675811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173533, + "balance_loss_mlp": 1.15370905, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.07919234366723153, + "language_loss": 0.78095168, + "learning_rate": 0.0006958657139041696, + "loss": 0.792687, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.19812012, + "step": 2031, + "time_per_iteration": 2.7535581588745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093882, + "balance_loss_mlp": 1.0820564, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.028372833662772774, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77806854, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.11816406, + "step": 2032, + "time_per_iteration": 4.918071508407593 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162947, + "balance_loss_mlp": 1.14219236, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.08595509799025135, + "language_loss": 0.78080893, + "learning_rate": 0.0006952922745149434, + "loss": 0.79243839, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.2076416, + "step": 2033, + "time_per_iteration": 2.6598660945892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160858, + "balance_loss_mlp": 1.14035416, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.06804618944659446, + "language_loss": 0.87450963, + "learning_rate": 0.000695005441035888, + "loss": 0.88611823, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.20507812, + "step": 2034, + "time_per_iteration": 2.6846048831939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073577, + "balance_loss_mlp": 1.06218028, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.025244772676945967, + "language_loss": 0.73723435, + 
"learning_rate": 0.0006947185318490064, + "loss": 0.7479701, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.11376953, + "step": 2035, + "time_per_iteration": 4.866973638534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147653, + "balance_loss_mlp": 1.12698257, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.06481204645981475, + "language_loss": 0.80968261, + "learning_rate": 0.0006944315470656863, + "loss": 0.82115912, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.20678711, + "step": 2036, + "time_per_iteration": 2.973759412765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139407, + "balance_loss_mlp": 1.11935592, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.08143475646221604, + "language_loss": 0.90850043, + "learning_rate": 0.000694144486797345, + "loss": 0.91989452, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.20043945, + "step": 2037, + "time_per_iteration": 2.736645221710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042479, + "balance_loss_mlp": 1.03184605, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.02072601949350613, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80562913, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.10644531, + "step": 2038, + "time_per_iteration": 4.651543140411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130224, + "balance_loss_mlp": 1.11029196, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.08780788201299033, + "language_loss": 0.89056122, + "learning_rate": 0.0006935701402514156, + "loss": 0.90186346, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.19921875, + "step": 2039, + "time_per_iteration": 2.610884666442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025416, + 
"balance_loss_mlp": 1.01525903, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.013600241372588764, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74060309, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.1015625, + "step": 2040, + "time_per_iteration": 4.982971906661987 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139694, + "balance_loss_mlp": 1.12033463, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.07758123210342138, + "language_loss": 0.84211379, + "learning_rate": 0.0006929954931031422, + "loss": 0.85351074, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.19348145, + "step": 2041, + "time_per_iteration": 3.722700595855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143114, + "balance_loss_mlp": 1.12322998, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.05684242147097161, + "language_loss": 0.88287592, + "learning_rate": 0.0006927080570819805, + "loss": 0.89430702, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.19885254, + "step": 2042, + "time_per_iteration": 2.6228466033935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146557, + "balance_loss_mlp": 1.12712598, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.09880041485830528, + "language_loss": 0.80978543, + "learning_rate": 0.0006924205462449161, + "loss": 0.82125103, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.19421387, + "step": 2043, + "time_per_iteration": 2.5959606170654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130878, + "balance_loss_mlp": 1.11204302, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.07421884933278829, + "language_loss": 0.81996524, + "learning_rate": 0.0006921329607035702, + "loss": 0.83127403, + "num_input_tokens_seen": 170517920, + 
"router_z_loss_mlp": 0.18823242, + "step": 2044, + "time_per_iteration": 3.2492971420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112622, + "balance_loss_mlp": 1.10749173, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.0837559423677037, + "language_loss": 0.87882477, + "learning_rate": 0.0006918453005695938, + "loss": 0.89008695, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.18701172, + "step": 2045, + "time_per_iteration": 2.649426221847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120666, + "balance_loss_mlp": 1.10098422, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.0619155211719984, + "language_loss": 0.84122574, + "learning_rate": 0.0006915575659546662, + "loss": 0.85243243, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.19665527, + "step": 2046, + "time_per_iteration": 2.7105627059936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109896, + "balance_loss_mlp": 1.09044123, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.0891593284161872, + "language_loss": 0.80576289, + "learning_rate": 0.0006912697569704959, + "loss": 0.81686187, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.19445801, + "step": 2047, + "time_per_iteration": 2.700460910797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117623, + "balance_loss_mlp": 1.09800088, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.09048948583390962, + "language_loss": 0.86559486, + "learning_rate": 0.0006909818737288205, + "loss": 0.87677109, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.19604492, + "step": 2048, + "time_per_iteration": 2.593365430831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122837, + "balance_loss_mlp": 1.10311985, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 
0.0812760632256331, + "language_loss": 0.8078903, + "learning_rate": 0.000690693916341406, + "loss": 0.81911868, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.19702148, + "step": 2049, + "time_per_iteration": 2.6433444023132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114252, + "balance_loss_mlp": 1.09472609, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.0788936263124851, + "language_loss": 0.82210761, + "learning_rate": 0.0006904058849200475, + "loss": 0.83325016, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.19506836, + "step": 2050, + "time_per_iteration": 2.7488439083099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114662, + "balance_loss_mlp": 1.09468246, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.10945632429468012, + "language_loss": 0.8477484, + "learning_rate": 0.0006901177795765683, + "loss": 0.858895, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.19970703, + "step": 2051, + "time_per_iteration": 2.6071059703826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101751, + "balance_loss_mlp": 1.08223617, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.07628310806963638, + "language_loss": 0.81390727, + "learning_rate": 0.0006898296004228213, + "loss": 0.82492483, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.19494629, + "step": 2052, + "time_per_iteration": 2.725609540939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195172, + "balance_loss_mlp": 1.18334627, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.06244005501870815, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79321915, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.11816406, + "step": 2053, + "time_per_iteration": 4.871281862258911 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01111122, + "balance_loss_mlp": 1.09123778, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.08281763462186637, + "language_loss": 0.79986715, + "learning_rate": 0.0006892530211320763, + "loss": 0.81097841, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.19873047, + "step": 2054, + "time_per_iteration": 2.7042620182037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125901, + "balance_loss_mlp": 1.10589778, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.08642547559894523, + "language_loss": 0.83690774, + "learning_rate": 0.000688964621218926, + "loss": 0.8481667, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.19995117, + "step": 2055, + "time_per_iteration": 2.6359920501708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120805, + "balance_loss_mlp": 1.10112405, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.10380118482872411, + "language_loss": 0.79915357, + "learning_rate": 0.0006886761479432037, + "loss": 0.81036162, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.19665527, + "step": 2056, + "time_per_iteration": 2.872950792312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122886, + "balance_loss_mlp": 1.10250163, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.07844536568455973, + "language_loss": 0.8461678, + "learning_rate": 0.0006883876014169045, + "loss": 0.8573966, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.20385742, + "step": 2057, + "time_per_iteration": 2.5555264949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132813, + "balance_loss_mlp": 1.11285698, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.08268955880836791, + "language_loss": 0.90132928, + "learning_rate": 0.000688098981752052, + "loss": 0.91265738, + "num_input_tokens_seen": 
171656064, + "router_z_loss_mlp": 0.19946289, + "step": 2058, + "time_per_iteration": 2.7518441677093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134779, + "balance_loss_mlp": 1.11504984, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.09934928750763956, + "language_loss": 0.80161107, + "learning_rate": 0.0006878102890606982, + "loss": 0.81295884, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.19726562, + "step": 2059, + "time_per_iteration": 3.098393678665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122363, + "balance_loss_mlp": 1.10231209, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.08965795352869743, + "language_loss": 0.80914015, + "learning_rate": 0.0006875215234549239, + "loss": 0.82036376, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.20043945, + "step": 2060, + "time_per_iteration": 2.591871976852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112284, + "balance_loss_mlp": 1.10284913, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.08963098282996143, + "language_loss": 0.85349464, + "learning_rate": 0.0006872326850468376, + "loss": 0.86472309, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.19995117, + "step": 2061, + "time_per_iteration": 2.7322757244110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121862, + "balance_loss_mlp": 1.10210919, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.08450203568488315, + "language_loss": 0.78602254, + "learning_rate": 0.0006869437739485762, + "loss": 0.79724109, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.19750977, + "step": 2062, + "time_per_iteration": 2.679453134536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.09274697, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 
0.07578248331540363, + "language_loss": 0.92750496, + "learning_rate": 0.0006866547902723053, + "loss": 0.93862933, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.19677734, + "step": 2063, + "time_per_iteration": 2.680661201477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100055, + "balance_loss_mlp": 1.08058822, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.07543651474129125, + "language_loss": 0.80317062, + "learning_rate": 0.000686365734130218, + "loss": 0.8141712, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.19458008, + "step": 2064, + "time_per_iteration": 2.695892095565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106834, + "balance_loss_mlp": 1.08669949, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.08078876442086359, + "language_loss": 0.84065503, + "learning_rate": 0.000686076605634536, + "loss": 0.85172331, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.20129395, + "step": 2065, + "time_per_iteration": 2.642617702484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113253, + "balance_loss_mlp": 1.0935117, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.08876156008903276, + "language_loss": 0.84441757, + "learning_rate": 0.0006857874048975088, + "loss": 0.85555011, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.19726562, + "step": 2066, + "time_per_iteration": 2.6363344192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102381, + "balance_loss_mlp": 1.08237755, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.06515627567230846, + "language_loss": 0.87180257, + "learning_rate": 0.0006854981320314142, + "loss": 0.88282633, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.19995117, + "step": 2067, + "time_per_iteration": 2.510763645172119 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01105961, + "balance_loss_mlp": 1.08644629, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.08362186096435482, + "language_loss": 0.86780995, + "learning_rate": 0.0006852087871485579, + "loss": 0.87886953, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.19506836, + "step": 2068, + "time_per_iteration": 2.653662919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106158, + "balance_loss_mlp": 1.08698964, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.09469661693362608, + "language_loss": 0.81769943, + "learning_rate": 0.0006849193703612735, + "loss": 0.82876104, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.19177246, + "step": 2069, + "time_per_iteration": 2.7798843383789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094976, + "balance_loss_mlp": 1.0750916, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.07513124412486355, + "language_loss": 0.77589542, + "learning_rate": 0.0006846298817819225, + "loss": 0.78684515, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.19873047, + "step": 2070, + "time_per_iteration": 2.984025716781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094931, + "balance_loss_mlp": 1.07543969, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.07496601113124422, + "language_loss": 0.80744815, + "learning_rate": 0.0006843403215228945, + "loss": 0.8183974, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.19482422, + "step": 2071, + "time_per_iteration": 2.4528424739837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113518, + "balance_loss_mlp": 1.09400368, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.10952507549773222, + "language_loss": 0.80553752, + "learning_rate": 0.0006840506896966065, + "loss": 0.81667268, + "num_input_tokens_seen": 
172636192, + "router_z_loss_mlp": 0.19519043, + "step": 2072, + "time_per_iteration": 2.7193689346313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113479, + "balance_loss_mlp": 1.09405994, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.07287911350271854, + "language_loss": 0.81897116, + "learning_rate": 0.0006837609864155038, + "loss": 0.8301059, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.1940918, + "step": 2073, + "time_per_iteration": 2.9260082244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110629, + "balance_loss_mlp": 1.09179354, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.0731734663182413, + "language_loss": 0.83157325, + "learning_rate": 0.0006834712117920592, + "loss": 0.8426795, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.18823242, + "step": 2074, + "time_per_iteration": 2.629744052886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117154, + "balance_loss_mlp": 1.09769917, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.07643256719558747, + "language_loss": 0.85673088, + "learning_rate": 0.0006831813659387729, + "loss": 0.8679024, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.19433594, + "step": 2075, + "time_per_iteration": 2.5350148677825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116666, + "balance_loss_mlp": 1.0971514, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.07671111115245405, + "language_loss": 0.84214932, + "learning_rate": 0.0006828914489681733, + "loss": 0.85331595, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.19494629, + "step": 2076, + "time_per_iteration": 2.724330425262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125458, + "balance_loss_mlp": 1.10627747, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 
0.08210563860740908, + "language_loss": 0.85224628, + "learning_rate": 0.0006826014609928162, + "loss": 0.86350089, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.19165039, + "step": 2077, + "time_per_iteration": 2.737734079360962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070244, + "balance_loss_mlp": 1.06070685, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.03932449118700248, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84269631, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.09521484, + "step": 2078, + "time_per_iteration": 4.887951612472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124282, + "balance_loss_mlp": 1.10458827, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.09240147129054761, + "language_loss": 0.80077326, + "learning_rate": 0.0006820212724781896, + "loss": 0.81201607, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.19677734, + "step": 2079, + "time_per_iteration": 2.6855874061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114733, + "balance_loss_mlp": 1.09537315, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.0724055342629082, + "language_loss": 0.84239459, + "learning_rate": 0.0006817310721641694, + "loss": 0.85354191, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.19335938, + "step": 2080, + "time_per_iteration": 2.902536392211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122461, + "balance_loss_mlp": 1.10289896, + "epoch": 0.4003462870334744, + "flos": 520356939264.0, + "grad_norm": 0.0894692108770988, + "language_loss": 0.83972865, + "learning_rate": 0.00068144080129589, + "loss": 0.85095322, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.19543457, + "step": 2081, + "time_per_iteration": 2.613067865371704 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01122596, + "balance_loss_mlp": 1.1030333, + "epoch": 0.400538668718738, + "flos": 492518195712.0, + "grad_norm": 0.09472281695894083, + "language_loss": 0.82174724, + "learning_rate": 0.0006811504599860441, + "loss": 0.83297324, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.19555664, + "step": 2082, + "time_per_iteration": 2.6002771854400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111624, + "balance_loss_mlp": 1.09634447, + "epoch": 0.40073105040400153, + "flos": 490356052992.0, + "grad_norm": 0.06828551193852998, + "language_loss": 0.85353184, + "learning_rate": 0.0006808600483473526, + "loss": 0.86469424, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.19897461, + "step": 2083, + "time_per_iteration": 2.9010846614837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107422, + "balance_loss_mlp": 1.0885756, + "epoch": 0.4009234320892651, + "flos": 562378070016.0, + "grad_norm": 0.07802980838834611, + "language_loss": 0.8652671, + "learning_rate": 0.0006805695664925629, + "loss": 0.87634128, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.18823242, + "step": 2084, + "time_per_iteration": 2.8027803897857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111632, + "balance_loss_mlp": 1.0970912, + "epoch": 0.40111581377452865, + "flos": 425998808064.0, + "grad_norm": 0.08245020261724635, + "language_loss": 0.8423562, + "learning_rate": 0.0006802790145344506, + "loss": 0.85351944, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.19238281, + "step": 2085, + "time_per_iteration": 2.5397531986236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119142, + "balance_loss_mlp": 1.10039067, + "epoch": 0.40130819545979224, + "flos": 612446842368.0, + "grad_norm": 0.07508565386227965, + "language_loss": 0.87270218, + "learning_rate": 0.0006799883925858176, + "loss": 0.88389367, + "num_input_tokens_seen": 
173796704, + "router_z_loss_mlp": 0.18737793, + "step": 2086, + "time_per_iteration": 2.876164197921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112869, + "balance_loss_mlp": 1.10978329, + "epoch": 0.40150057714505577, + "flos": 523433124864.0, + "grad_norm": 0.07429159623595777, + "language_loss": 0.84809011, + "learning_rate": 0.0006796977007594933, + "loss": 0.85937703, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.18896484, + "step": 2087, + "time_per_iteration": 2.6302778720855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136639, + "balance_loss_mlp": 1.11681485, + "epoch": 0.40169295883031936, + "flos": 561424379904.0, + "grad_norm": 0.06510767025647884, + "language_loss": 0.86000383, + "learning_rate": 0.0006794069391683345, + "loss": 0.8713702, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.19824219, + "step": 2088, + "time_per_iteration": 2.7642226219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125568, + "balance_loss_mlp": 1.10582721, + "epoch": 0.4018853405155829, + "flos": 518997984768.0, + "grad_norm": 0.07763642733040174, + "language_loss": 0.80219448, + "learning_rate": 0.0006791161079252248, + "loss": 0.81345016, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.19726562, + "step": 2089, + "time_per_iteration": 2.6216719150543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112898, + "balance_loss_mlp": 1.10969198, + "epoch": 0.4020777222008465, + "flos": 526222614528.0, + "grad_norm": 0.06753993516242088, + "language_loss": 0.82396168, + "learning_rate": 0.0006788252071430747, + "loss": 0.83525145, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.19262695, + "step": 2090, + "time_per_iteration": 2.6881613731384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136792, + "balance_loss_mlp": 1.11759949, + "epoch": 0.40227010388611006, + "flos": 525763021824.0, + 
"grad_norm": 0.07938192983074185, + "language_loss": 0.86496997, + "learning_rate": 0.0006785342369348222, + "loss": 0.87633789, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.19177246, + "step": 2091, + "time_per_iteration": 2.7187774181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134169, + "balance_loss_mlp": 1.11497617, + "epoch": 0.4024624855713736, + "flos": 432304252416.0, + "grad_norm": 0.08007566317284716, + "language_loss": 0.79674286, + "learning_rate": 0.0006782431974134316, + "loss": 0.80808461, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.19189453, + "step": 2092, + "time_per_iteration": 2.5497889518737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112292, + "balance_loss_mlp": 1.10301197, + "epoch": 0.4026548672566372, + "flos": 766660640256.0, + "grad_norm": 0.09546920549904063, + "language_loss": 0.89602369, + "learning_rate": 0.0006779520886918949, + "loss": 0.90725285, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.19897461, + "step": 2093, + "time_per_iteration": 3.070051431655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126954, + "balance_loss_mlp": 1.10783303, + "epoch": 0.4028472489419007, + "flos": 642931914240.0, + "grad_norm": 0.07932487566864904, + "language_loss": 0.81140947, + "learning_rate": 0.0006776609108832301, + "loss": 0.82267904, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.19116211, + "step": 2094, + "time_per_iteration": 2.8635079860687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117981, + "balance_loss_mlp": 1.09895563, + "epoch": 0.4030396306271643, + "flos": 491838718464.0, + "grad_norm": 0.08200776323916202, + "language_loss": 0.85093951, + "learning_rate": 0.0006773696641004828, + "loss": 0.86211932, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.19006348, + "step": 2095, + "time_per_iteration": 2.569387435913086 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119321, + "balance_loss_mlp": 1.09972358, + "epoch": 0.40323201231242783, + "flos": 901728308736.0, + "grad_norm": 0.09231967023328698, + "language_loss": 0.77639973, + "learning_rate": 0.0006770783484567247, + "loss": 0.78759301, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.19592285, + "step": 2096, + "time_per_iteration": 3.1237080097198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109679, + "balance_loss_mlp": 1.09033108, + "epoch": 0.4034243939976914, + "flos": 570558961152.0, + "grad_norm": 0.07679281592908915, + "language_loss": 0.86043823, + "learning_rate": 0.000676786964065055, + "loss": 0.871535, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.1932373, + "step": 2097, + "time_per_iteration": 2.785017728805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112121, + "balance_loss_mlp": 1.10181427, + "epoch": 0.403616775682955, + "flos": 507456783360.0, + "grad_norm": 0.07049509838223245, + "language_loss": 0.78567326, + "learning_rate": 0.0006764955110385986, + "loss": 0.79688537, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.19384766, + "step": 2098, + "time_per_iteration": 2.7599899768829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011178, + "balance_loss_mlp": 1.09878576, + "epoch": 0.40380915736821854, + "flos": 519383425536.0, + "grad_norm": 0.07587511524565468, + "language_loss": 0.8025918, + "learning_rate": 0.0006762039894905083, + "loss": 0.81376982, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.19006348, + "step": 2099, + "time_per_iteration": 2.6616034507751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115132, + "balance_loss_mlp": 1.09635651, + "epoch": 0.40400153905348213, + "flos": 441925590528.0, + "grad_norm": 0.08446355623188201, + "language_loss": 0.80088019, + "learning_rate": 0.000675912399533962, + "loss": 0.81203151, + 
"num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.1875, + "step": 2100, + "time_per_iteration": 2.53584885597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129908, + "balance_loss_mlp": 1.1112045, + "epoch": 0.40419392073874566, + "flos": 772309002240.0, + "grad_norm": 0.057425192194628195, + "language_loss": 0.84893382, + "learning_rate": 0.0006756207412821656, + "loss": 0.86023289, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.18701172, + "step": 2101, + "time_per_iteration": 3.0146372318267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133046, + "balance_loss_mlp": 1.11444974, + "epoch": 0.40438630242400925, + "flos": 766569235968.0, + "grad_norm": 0.08385244443422216, + "language_loss": 0.79946959, + "learning_rate": 0.0006753290148483505, + "loss": 0.81080002, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.18603516, + "step": 2102, + "time_per_iteration": 3.1141843795776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131753, + "balance_loss_mlp": 1.11306119, + "epoch": 0.4045786841092728, + "flos": 415235828736.0, + "grad_norm": 0.10321495678621663, + "language_loss": 0.7855078, + "learning_rate": 0.0006750372203457752, + "loss": 0.79682529, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.18688965, + "step": 2103, + "time_per_iteration": 2.5273704528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133211, + "balance_loss_mlp": 1.1144712, + "epoch": 0.40477106579453637, + "flos": 539214174720.0, + "grad_norm": 0.06897182936898366, + "language_loss": 0.86569643, + "learning_rate": 0.0006747453578877242, + "loss": 0.87702858, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.18725586, + "step": 2104, + "time_per_iteration": 2.7731292247772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136147, + "balance_loss_mlp": 1.11752641, + "epoch": 0.4049634474797999, + "flos": 
826704258048.0, + "grad_norm": 0.08357448735589112, + "language_loss": 0.82917869, + "learning_rate": 0.0006744534275875085, + "loss": 0.84054017, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.1862793, + "step": 2105, + "time_per_iteration": 3.0466742515563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148022, + "balance_loss_mlp": 1.12974763, + "epoch": 0.4051558291650635, + "flos": 572684027904.0, + "grad_norm": 0.09276188373090515, + "language_loss": 0.85562009, + "learning_rate": 0.0006741614295584657, + "loss": 0.8671003, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.18273926, + "step": 2106, + "time_per_iteration": 2.678776264190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115565, + "balance_loss_mlp": 1.13704157, + "epoch": 0.4053482108503271, + "flos": 731881391616.0, + "grad_norm": 0.0813184956351506, + "language_loss": 0.78235412, + "learning_rate": 0.0006738693639139595, + "loss": 0.79391062, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.18603516, + "step": 2107, + "time_per_iteration": 3.0155587196350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157609, + "balance_loss_mlp": 1.13920343, + "epoch": 0.4055405925355906, + "flos": 1213059193344.0, + "grad_norm": 0.09421684944263367, + "language_loss": 0.77232802, + "learning_rate": 0.0006735772307673796, + "loss": 0.78390408, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.18408203, + "step": 2108, + "time_per_iteration": 3.586928129196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165053, + "balance_loss_mlp": 1.14651608, + "epoch": 0.4057329742208542, + "flos": 715863204864.0, + "grad_norm": 0.06861239024528153, + "language_loss": 0.83003211, + "learning_rate": 0.0006732850302321421, + "loss": 0.84168267, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.18518066, + "step": 2109, + "time_per_iteration": 2.9429726600646973 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160041, + "balance_loss_mlp": 1.14086008, + "epoch": 0.4059253559061177, + "flos": 564888577536.0, + "grad_norm": 0.07515968908819307, + "language_loss": 0.84144229, + "learning_rate": 0.00067299276242169, + "loss": 0.85304272, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.19177246, + "step": 2110, + "time_per_iteration": 2.6710071563720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044281, + "balance_loss_mlp": 1.03436232, + "epoch": 0.4061177375913813, + "flos": 1593744450048.0, + "grad_norm": 0.023257265358085616, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75426447, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.09912109, + "step": 2111, + "time_per_iteration": 4.914813756942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151064, + "balance_loss_mlp": 1.13221717, + "epoch": 0.40631011927664484, + "flos": 615421711872.0, + "grad_norm": 0.09830411974127871, + "language_loss": 0.77889705, + "learning_rate": 0.0006724080254290395, + "loss": 0.79040766, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.18811035, + "step": 2112, + "time_per_iteration": 2.8067259788513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136038, + "balance_loss_mlp": 1.11665511, + "epoch": 0.40650250096190843, + "flos": 557661376512.0, + "grad_norm": 0.07964969066506762, + "language_loss": 0.89744002, + "learning_rate": 0.0006721155564738566, + "loss": 0.90880042, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.19360352, + "step": 2113, + "time_per_iteration": 2.7009260654449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105786, + "balance_loss_mlp": 1.04798985, + "epoch": 0.40669488264717196, + "flos": 1580147564544.0, + "grad_norm": 0.033284036056789104, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 
0.79680502, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.09863281, + "step": 2114, + "time_per_iteration": 4.983005523681641 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127405, + "balance_loss_mlp": 1.10823643, + "epoch": 0.40688726433243555, + "flos": 507649503744.0, + "grad_norm": 0.07850906735960049, + "language_loss": 0.85233408, + "learning_rate": 0.0006715304182135078, + "loss": 0.86360812, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.19152832, + "step": 2115, + "time_per_iteration": 2.6078672409057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114305, + "balance_loss_mlp": 1.09480286, + "epoch": 0.40707964601769914, + "flos": 589075172352.0, + "grad_norm": 0.063032684383759, + "language_loss": 0.88685012, + "learning_rate": 0.0006712377491355127, + "loss": 0.89799315, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.19482422, + "step": 2116, + "time_per_iteration": 2.8919928073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011132, + "balance_loss_mlp": 1.09403157, + "epoch": 0.40727202770296267, + "flos": 580437259776.0, + "grad_norm": 0.07591389839440288, + "language_loss": 0.81216896, + "learning_rate": 0.0006709450135771274, + "loss": 0.82330096, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.19152832, + "step": 2117, + "time_per_iteration": 2.948209524154663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110152, + "balance_loss_mlp": 1.09097123, + "epoch": 0.40746440938822626, + "flos": 504076649472.0, + "grad_norm": 0.0664106663118444, + "language_loss": 0.86270058, + "learning_rate": 0.0006706522116520023, + "loss": 0.87380207, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.19177246, + "step": 2118, + "time_per_iteration": 2.63297963142395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109766, + "balance_loss_mlp": 1.09078836, + "epoch": 0.4076567910734898, + 
"flos": 605600312832.0, + "grad_norm": 0.08309315753094405, + "language_loss": 0.82646739, + "learning_rate": 0.0006703593434738127, + "loss": 0.83756506, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.18969727, + "step": 2119, + "time_per_iteration": 2.7504334449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110305, + "balance_loss_mlp": 1.08339202, + "epoch": 0.4078491727587534, + "flos": 479553799680.0, + "grad_norm": 0.06315918122435989, + "language_loss": 0.78157568, + "learning_rate": 0.0006700664091562604, + "loss": 0.79260623, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.1965332, + "step": 2120, + "time_per_iteration": 2.5809123516082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109509, + "balance_loss_mlp": 1.08968461, + "epoch": 0.4080415544440169, + "flos": 510384665088.0, + "grad_norm": 0.06251573302429693, + "language_loss": 0.84974718, + "learning_rate": 0.0006697734088130725, + "loss": 0.86084229, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.19812012, + "step": 2121, + "time_per_iteration": 2.6444742679595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103604, + "balance_loss_mlp": 1.08350492, + "epoch": 0.4082339361292805, + "flos": 734638947840.0, + "grad_norm": 0.08444724355881765, + "language_loss": 0.85282058, + "learning_rate": 0.0006694803425580018, + "loss": 0.86385661, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.20080566, + "step": 2122, + "time_per_iteration": 2.9844353199005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101862, + "balance_loss_mlp": 1.08126235, + "epoch": 0.4084263178145441, + "flos": 457472074752.0, + "grad_norm": 0.08120556309716129, + "language_loss": 0.84838599, + "learning_rate": 0.0006691872105048268, + "loss": 0.85940456, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.20605469, + "step": 2123, + "time_per_iteration": 2.587648868560791 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104271, + "balance_loss_mlp": 1.08323061, + "epoch": 0.4086186994998076, + "flos": 562931638272.0, + "grad_norm": 0.07277240915985977, + "language_loss": 0.84579539, + "learning_rate": 0.0006688940127673513, + "loss": 0.85683805, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.21044922, + "step": 2124, + "time_per_iteration": 2.6976451873779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108248, + "balance_loss_mlp": 1.08663535, + "epoch": 0.4088110811850712, + "flos": 573669651456.0, + "grad_norm": 0.07888289921071225, + "language_loss": 0.85375637, + "learning_rate": 0.0006686007494594049, + "loss": 0.86483884, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.21618652, + "step": 2125, + "time_per_iteration": 2.842721700668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109657, + "balance_loss_mlp": 1.075279, + "epoch": 0.40900346287033473, + "flos": 456930989568.0, + "grad_norm": 0.1494487487543463, + "language_loss": 0.80707026, + "learning_rate": 0.0006683074206948425, + "loss": 0.81803596, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.21289062, + "step": 2126, + "time_per_iteration": 2.54156231880188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088684, + "balance_loss_mlp": 1.06790602, + "epoch": 0.4091958445555983, + "flos": 617395903488.0, + "grad_norm": 0.07127639192135228, + "language_loss": 0.81315231, + "learning_rate": 0.0006680140265875443, + "loss": 0.82403916, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.20788574, + "step": 2127, + "time_per_iteration": 2.8282980918884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093135, + "balance_loss_mlp": 1.07241678, + "epoch": 0.40938822624086185, + "flos": 472400750592.0, + "grad_norm": 0.07736719826860473, + "language_loss": 0.953547, + "learning_rate": 0.0006677205672514162, + "loss": 
0.96447837, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.20715332, + "step": 2128, + "time_per_iteration": 2.635601758956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089263, + "balance_loss_mlp": 1.06965339, + "epoch": 0.40958060792612544, + "flos": 570010535424.0, + "grad_norm": 0.07314070036202396, + "language_loss": 0.88630438, + "learning_rate": 0.000667427042800389, + "loss": 0.89719707, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.19604492, + "step": 2129, + "time_per_iteration": 2.792956829071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094452, + "balance_loss_mlp": 1.07447219, + "epoch": 0.40977298961138897, + "flos": 609346063872.0, + "grad_norm": 0.07258896862524182, + "language_loss": 0.82793128, + "learning_rate": 0.0006671334533484192, + "loss": 0.83887583, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.19970703, + "step": 2130, + "time_per_iteration": 2.773900270462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_mlp": 1.07694483, + "epoch": 0.40996537129665256, + "flos": 581744457216.0, + "grad_norm": 0.07325583153216161, + "language_loss": 0.83178955, + "learning_rate": 0.0006668397990094881, + "loss": 0.84274781, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.1887207, + "step": 2131, + "time_per_iteration": 2.752606153488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110283, + "balance_loss_mlp": 1.08409071, + "epoch": 0.41015775298191615, + "flos": 516546948096.0, + "grad_norm": 0.08072513277707091, + "language_loss": 0.84810466, + "learning_rate": 0.0006665460798976027, + "loss": 0.85913295, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.18725586, + "step": 2132, + "time_per_iteration": 2.7918195724487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101646, + "balance_loss_mlp": 1.08277488, + "epoch": 0.4103501346671797, + 
"flos": 510354929664.0, + "grad_norm": 0.057661652953568024, + "language_loss": 0.8113941, + "learning_rate": 0.0006662522961267947, + "loss": 0.82241058, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.18859863, + "step": 2133, + "time_per_iteration": 2.7084174156188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114188, + "balance_loss_mlp": 1.09586525, + "epoch": 0.41054251635244327, + "flos": 549752126976.0, + "grad_norm": 0.07117823449693282, + "language_loss": 0.86957145, + "learning_rate": 0.0006659584478111211, + "loss": 0.88071334, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.18322754, + "step": 2134, + "time_per_iteration": 2.8745734691619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120532, + "balance_loss_mlp": 1.10234094, + "epoch": 0.4107348980377068, + "flos": 839898450432.0, + "grad_norm": 0.10436544040673855, + "language_loss": 0.82673836, + "learning_rate": 0.000665664535064664, + "loss": 0.83794367, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.1817627, + "step": 2135, + "time_per_iteration": 3.0361244678497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120758, + "balance_loss_mlp": 1.10167265, + "epoch": 0.4109272797229704, + "flos": 503708461056.0, + "grad_norm": 0.07372821186051039, + "language_loss": 0.82676935, + "learning_rate": 0.0006653705580015303, + "loss": 0.83797693, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.1907959, + "step": 2136, + "time_per_iteration": 2.6784329414367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121286, + "balance_loss_mlp": 1.10184264, + "epoch": 0.4111196614082339, + "flos": 610830927360.0, + "grad_norm": 0.08099943161450797, + "language_loss": 0.8610462, + "learning_rate": 0.0006650765167358523, + "loss": 0.87225902, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.19421387, + "step": 2137, + "time_per_iteration": 2.8350300788879395 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113363, + "balance_loss_mlp": 1.09431374, + "epoch": 0.4113120430934975, + "flos": 453165414912.0, + "grad_norm": 0.09328592607957716, + "language_loss": 0.89696336, + "learning_rate": 0.0006647824113817864, + "loss": 0.90809703, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.19030762, + "step": 2138, + "time_per_iteration": 2.5345799922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112598, + "balance_loss_mlp": 1.09391761, + "epoch": 0.41150442477876104, + "flos": 541600971264.0, + "grad_norm": 0.24980936370747706, + "language_loss": 0.81674927, + "learning_rate": 0.000664488242053515, + "loss": 0.82787526, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.18688965, + "step": 2139, + "time_per_iteration": 2.729074716567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112354, + "balance_loss_mlp": 1.09430587, + "epoch": 0.4116968064640246, + "flos": 576291386880.0, + "grad_norm": 0.06520257719296937, + "language_loss": 0.8372556, + "learning_rate": 0.0006641940088652445, + "loss": 0.84837914, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.18054199, + "step": 2140, + "time_per_iteration": 2.822861909866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114304, + "balance_loss_mlp": 1.09476542, + "epoch": 0.4118891881492882, + "flos": 496115642880.0, + "grad_norm": 0.09690666410410188, + "language_loss": 0.82505018, + "learning_rate": 0.0006638997119312065, + "loss": 0.8361932, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.1953125, + "step": 2141, + "time_per_iteration": 2.7164361476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081351, + "balance_loss_mlp": 1.0707655, + "epoch": 0.41208156983455174, + "flos": 1538395877376.0, + "grad_norm": 0.03550975461959617, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 
0.7614466, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.10595703, + "step": 2142, + "time_per_iteration": 4.951165437698364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116466, + "balance_loss_mlp": 1.09734452, + "epoch": 0.41227395151981533, + "flos": 584968946688.0, + "grad_norm": 0.10349541439789608, + "language_loss": 0.8488189, + "learning_rate": 0.000663310927282877, + "loss": 0.8599835, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.19116211, + "step": 2143, + "time_per_iteration": 2.834325075149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123685, + "balance_loss_mlp": 1.10346723, + "epoch": 0.41246633320507886, + "flos": 442926268416.0, + "grad_norm": 0.07414481576642443, + "language_loss": 0.85735166, + "learning_rate": 0.000663016439797172, + "loss": 0.86858845, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.20214844, + "step": 2144, + "time_per_iteration": 2.641390800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118144, + "balance_loss_mlp": 1.09814095, + "epoch": 0.41265871489034245, + "flos": 579962985984.0, + "grad_norm": 0.07853696289984005, + "language_loss": 0.80941319, + "learning_rate": 0.0006627218890228724, + "loss": 0.82059467, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.20007324, + "step": 2145, + "time_per_iteration": 2.7847142219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115703, + "balance_loss_mlp": 1.0958544, + "epoch": 0.412851096575606, + "flos": 761229964800.0, + "grad_norm": 0.07518431098775835, + "language_loss": 0.83727562, + "learning_rate": 0.0006624272750743326, + "loss": 0.84843272, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.19836426, + "step": 2146, + "time_per_iteration": 3.0317938327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117359, + "balance_loss_mlp": 1.09733224, + "epoch": 0.41304347826086957, + 
"flos": 555353501184.0, + "grad_norm": 0.06462993006694184, + "language_loss": 0.8283999, + "learning_rate": 0.0006621325980659322, + "loss": 0.83957344, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.20019531, + "step": 2147, + "time_per_iteration": 2.786724328994751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118, + "balance_loss_mlp": 1.0978415, + "epoch": 0.41323585994613315, + "flos": 665712940032.0, + "grad_norm": 0.10640671392978962, + "language_loss": 0.81600213, + "learning_rate": 0.000661837858112075, + "loss": 0.82718211, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.20153809, + "step": 2148, + "time_per_iteration": 2.854837417602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115633, + "balance_loss_mlp": 1.09577227, + "epoch": 0.4134282416313967, + "flos": 548699692032.0, + "grad_norm": 0.06752887879335369, + "language_loss": 0.88443303, + "learning_rate": 0.0006615430553271888, + "loss": 0.89558935, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.19848633, + "step": 2149, + "time_per_iteration": 2.8243539333343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115349, + "balance_loss_mlp": 1.09486902, + "epoch": 0.4136206233166603, + "flos": 646262489088.0, + "grad_norm": 0.06757702274708675, + "language_loss": 0.85010874, + "learning_rate": 0.0006612481898257264, + "loss": 0.8612622, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.20483398, + "step": 2150, + "time_per_iteration": 2.870486259460449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114158, + "balance_loss_mlp": 1.09377337, + "epoch": 0.4138130050019238, + "flos": 517354905600.0, + "grad_norm": 0.08316851802653256, + "language_loss": 0.85005617, + "learning_rate": 0.000660953261722165, + "loss": 0.86119783, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.20385742, + "step": 2151, + "time_per_iteration": 2.6056485176086426 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112582, + "balance_loss_mlp": 1.09265018, + "epoch": 0.4140053866871874, + "flos": 609254659584.0, + "grad_norm": 0.06870221870710541, + "language_loss": 0.82367688, + "learning_rate": 0.0006606582711310055, + "loss": 0.83480269, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.19934082, + "step": 2152, + "time_per_iteration": 2.7264139652252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119446, + "balance_loss_mlp": 1.09854901, + "epoch": 0.4141977683724509, + "flos": 579762925056.0, + "grad_norm": 0.0720639200532027, + "language_loss": 0.83059323, + "learning_rate": 0.0006603632181667736, + "loss": 0.8417877, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.20910645, + "step": 2153, + "time_per_iteration": 2.6930761337280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055226, + "balance_loss_mlp": 1.04149318, + "epoch": 0.4143901500577145, + "flos": 1307312317440.0, + "grad_norm": 0.029268536031501605, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.79998553, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.13769531, + "step": 2154, + "time_per_iteration": 4.951904773712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133368, + "balance_loss_mlp": 1.11335301, + "epoch": 0.41458253174297804, + "flos": 460189983744.0, + "grad_norm": 0.08213185756435645, + "language_loss": 0.81797659, + "learning_rate": 0.0006597729255773153, + "loss": 0.82931024, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.20007324, + "step": 2155, + "time_per_iteration": 2.6153218746185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142819, + "balance_loss_mlp": 1.1224227, + "epoch": 0.41477491342824163, + "flos": 553364628480.0, + "grad_norm": 0.0847752552783981, + "language_loss": 0.82203597, + "learning_rate": 0.0006594776861812608, + "loss": 
0.83346415, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.20397949, + "step": 2156, + "time_per_iteration": 2.68922758102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153838, + "balance_loss_mlp": 1.13410926, + "epoch": 0.4149672951135052, + "flos": 697771708416.0, + "grad_norm": 0.06809079383741527, + "language_loss": 0.86262864, + "learning_rate": 0.0006591823848704776, + "loss": 0.87416703, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.1973877, + "step": 2157, + "time_per_iteration": 2.9523754119873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147297, + "balance_loss_mlp": 1.12693584, + "epoch": 0.41515967679876875, + "flos": 565750863360.0, + "grad_norm": 0.07690135227418383, + "language_loss": 0.81358635, + "learning_rate": 0.0006588870217596117, + "loss": 0.82505929, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.20361328, + "step": 2158, + "time_per_iteration": 2.7730822563171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140876, + "balance_loss_mlp": 1.12146926, + "epoch": 0.41535205848403234, + "flos": 501185843712.0, + "grad_norm": 0.08370852265526307, + "language_loss": 0.857876, + "learning_rate": 0.0006585915969633334, + "loss": 0.86928475, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.19396973, + "step": 2159, + "time_per_iteration": 2.6628706455230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133859, + "balance_loss_mlp": 1.1143918, + "epoch": 0.41554444016929587, + "flos": 607554680832.0, + "grad_norm": 0.07868666241976846, + "language_loss": 0.8926276, + "learning_rate": 0.0006582961105963366, + "loss": 0.90396619, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.19445801, + "step": 2160, + "time_per_iteration": 2.856227397918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126433, + "balance_loss_mlp": 1.10702562, + "epoch": 0.41573682185455946, + 
"flos": 529115991552.0, + "grad_norm": 0.10110909063497833, + "language_loss": 0.77701914, + "learning_rate": 0.0006580005627733395, + "loss": 0.78828347, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.19396973, + "step": 2161, + "time_per_iteration": 2.763690948486328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131659, + "balance_loss_mlp": 1.11281204, + "epoch": 0.415929203539823, + "flos": 504956187648.0, + "grad_norm": 0.0788483903527846, + "language_loss": 0.81671721, + "learning_rate": 0.0006577049536090838, + "loss": 0.8280338, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.18823242, + "step": 2162, + "time_per_iteration": 2.7156083583831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130408, + "balance_loss_mlp": 1.11114359, + "epoch": 0.4161215852250866, + "flos": 582737794560.0, + "grad_norm": 0.08609543464950487, + "language_loss": 0.85536218, + "learning_rate": 0.000657409283218335, + "loss": 0.8666662, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.19250488, + "step": 2163, + "time_per_iteration": 2.711332082748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135342, + "balance_loss_mlp": 1.11707878, + "epoch": 0.4163139669103501, + "flos": 490697077248.0, + "grad_norm": 0.08463355465100361, + "language_loss": 0.81072271, + "learning_rate": 0.0006571135517158829, + "loss": 0.82207608, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.18273926, + "step": 2164, + "time_per_iteration": 2.6715452671051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029054, + "balance_loss_mlp": 1.01865911, + "epoch": 0.4165063485956137, + "flos": 1288158474240.0, + "grad_norm": 0.01758070932569607, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77793115, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.10400391, + "step": 2165, + "time_per_iteration": 4.765650272369385 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154419, + "balance_loss_mlp": 1.13588202, + "epoch": 0.4166987302808773, + "flos": 495263268864.0, + "grad_norm": 0.09117992314911788, + "language_loss": 0.828076, + "learning_rate": 0.0006565219058351444, + "loss": 0.83962023, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.18530273, + "step": 2166, + "time_per_iteration": 2.568162202835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160705, + "balance_loss_mlp": 1.14153659, + "epoch": 0.4168911119661408, + "flos": 464071555584.0, + "grad_norm": 0.1435965153845973, + "language_loss": 0.82720423, + "learning_rate": 0.0006562259916865553, + "loss": 0.83881128, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.19165039, + "step": 2167, + "time_per_iteration": 2.577831506729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146961, + "balance_loss_mlp": 1.12813759, + "epoch": 0.4170834936514044, + "flos": 536787730944.0, + "grad_norm": 0.10197305761412122, + "language_loss": 0.79348731, + "learning_rate": 0.0006559300168856573, + "loss": 0.80495691, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.18798828, + "step": 2168, + "time_per_iteration": 2.7849843502044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143742, + "balance_loss_mlp": 1.12485933, + "epoch": 0.41727587533666793, + "flos": 550683795456.0, + "grad_norm": 0.07754195288754885, + "language_loss": 0.86023396, + "learning_rate": 0.0006556339815473577, + "loss": 0.87167138, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.1887207, + "step": 2169, + "time_per_iteration": 2.7085328102111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142677, + "balance_loss_mlp": 1.12390125, + "epoch": 0.4174682570219315, + "flos": 631111357440.0, + "grad_norm": 0.08981224380419678, + "language_loss": 0.86090291, + "learning_rate": 0.000655337885786588, + "loss": 
0.87232965, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.1875, + "step": 2170, + "time_per_iteration": 2.9244213104248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128382, + "balance_loss_mlp": 1.10963011, + "epoch": 0.41766063870719505, + "flos": 519751613952.0, + "grad_norm": 0.08419137591764536, + "language_loss": 0.8483454, + "learning_rate": 0.0006550417297183025, + "loss": 0.85962915, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.18737793, + "step": 2171, + "time_per_iteration": 2.6424126625061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116149, + "balance_loss_mlp": 1.09746861, + "epoch": 0.41785302039245864, + "flos": 557935589376.0, + "grad_norm": 0.07276027667818112, + "language_loss": 0.81700563, + "learning_rate": 0.0006547455134574793, + "loss": 0.82816714, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.18664551, + "step": 2172, + "time_per_iteration": 2.743807315826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118053, + "balance_loss_mlp": 1.09920597, + "epoch": 0.41804540207772223, + "flos": 788529821184.0, + "grad_norm": 0.06582530373346562, + "language_loss": 0.83907378, + "learning_rate": 0.0006544492371191198, + "loss": 0.85025424, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.18847656, + "step": 2173, + "time_per_iteration": 3.1398048400878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112075, + "balance_loss_mlp": 1.09203625, + "epoch": 0.41823778376298576, + "flos": 904332418560.0, + "grad_norm": 0.07927924785081189, + "language_loss": 0.83028531, + "learning_rate": 0.0006541529008182485, + "loss": 0.84140611, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.20031738, + "step": 2174, + "time_per_iteration": 3.218675136566162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113128, + "balance_loss_mlp": 1.09423363, + "epoch": 0.41843016544824935, + 
"flos": 511560811008.0, + "grad_norm": 0.08063752220274202, + "language_loss": 0.87068301, + "learning_rate": 0.0006538565046699136, + "loss": 0.88181424, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.18884277, + "step": 2175, + "time_per_iteration": 2.623373031616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110698, + "balance_loss_mlp": 1.09179151, + "epoch": 0.4186225471335129, + "flos": 653077085184.0, + "grad_norm": 0.10224918928766584, + "language_loss": 0.80967259, + "learning_rate": 0.0006535600487891862, + "loss": 0.82077956, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.18896484, + "step": 2176, + "time_per_iteration": 2.858027935028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108436, + "balance_loss_mlp": 1.08948123, + "epoch": 0.41881492881877647, + "flos": 569158161408.0, + "grad_norm": 0.0620502143296578, + "language_loss": 0.88827038, + "learning_rate": 0.0006532635332911603, + "loss": 0.89935476, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.1895752, + "step": 2177, + "time_per_iteration": 2.6979219913482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126272, + "balance_loss_mlp": 1.10786629, + "epoch": 0.41900731050404, + "flos": 911878248960.0, + "grad_norm": 0.06643064450406437, + "language_loss": 0.80475914, + "learning_rate": 0.0006529669582909541, + "loss": 0.81602192, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.18408203, + "step": 2178, + "time_per_iteration": 3.246621608734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112675, + "balance_loss_mlp": 1.10820079, + "epoch": 0.4191996921893036, + "flos": 535755119616.0, + "grad_norm": 0.08441696273800357, + "language_loss": 0.85626066, + "learning_rate": 0.0006526703239037077, + "loss": 0.8675282, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.1854248, + "step": 2179, + "time_per_iteration": 2.67114520072937 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126772, + "balance_loss_mlp": 1.10779428, + "epoch": 0.4193920738745671, + "flos": 582636478464.0, + "grad_norm": 0.07577304920294069, + "language_loss": 0.86212498, + "learning_rate": 0.0006523736302445851, + "loss": 0.8733927, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.18969727, + "step": 2180, + "time_per_iteration": 2.7883896827697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132192, + "balance_loss_mlp": 1.11371422, + "epoch": 0.4195844555598307, + "flos": 1335782472192.0, + "grad_norm": 0.08559665169482955, + "language_loss": 0.77047896, + "learning_rate": 0.0006520768774287728, + "loss": 0.78180093, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.18469238, + "step": 2181, + "time_per_iteration": 3.777104616165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127516, + "balance_loss_mlp": 1.10862184, + "epoch": 0.4197768372450943, + "flos": 598783145472.0, + "grad_norm": 0.06395892384144108, + "language_loss": 0.85356331, + "learning_rate": 0.0006517800655714806, + "loss": 0.86483848, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.18884277, + "step": 2182, + "time_per_iteration": 2.8449056148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116562, + "balance_loss_mlp": 1.09781027, + "epoch": 0.4199692189303578, + "flos": 735261525504.0, + "grad_norm": 0.07104751702384272, + "language_loss": 0.85029149, + "learning_rate": 0.0006514831947879407, + "loss": 0.86145711, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.18737793, + "step": 2183, + "time_per_iteration": 2.990061044692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107738, + "balance_loss_mlp": 1.08917689, + "epoch": 0.4201616006156214, + "flos": 750214794240.0, + "grad_norm": 0.10339737087855795, + "language_loss": 0.78075212, + "learning_rate": 0.0006511862651934091, + "loss": 
0.79182947, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.18566895, + "step": 2184, + "time_per_iteration": 3.0668697357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107089, + "balance_loss_mlp": 1.08805084, + "epoch": 0.42035398230088494, + "flos": 547029448704.0, + "grad_norm": 0.06769253041220206, + "language_loss": 0.8183164, + "learning_rate": 0.0006508892769031638, + "loss": 0.82938731, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.19018555, + "step": 2185, + "time_per_iteration": 2.6562998294830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109571, + "balance_loss_mlp": 1.0908668, + "epoch": 0.42054636398614853, + "flos": 616911717888.0, + "grad_norm": 0.09820566679610492, + "language_loss": 0.86607713, + "learning_rate": 0.000650592230032506, + "loss": 0.87717283, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.18676758, + "step": 2186, + "time_per_iteration": 2.7687323093414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115297, + "balance_loss_mlp": 1.09592557, + "epoch": 0.42073874567141206, + "flos": 640394242560.0, + "grad_norm": 0.07480815577141971, + "language_loss": 0.84954965, + "learning_rate": 0.0006502951246967595, + "loss": 0.86070257, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.19360352, + "step": 2187, + "time_per_iteration": 2.8850929737091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105112, + "balance_loss_mlp": 1.0856576, + "epoch": 0.42093112735667565, + "flos": 493783174656.0, + "grad_norm": 0.07526055561420332, + "language_loss": 0.86650884, + "learning_rate": 0.0006499979610112706, + "loss": 0.87756002, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.19445801, + "step": 2188, + "time_per_iteration": 2.6973655223846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110798, + "balance_loss_mlp": 1.09087813, + "epoch": 0.4211235090419392, 
+ "flos": 542364512256.0, + "grad_norm": 0.09941258674264111, + "language_loss": 0.84241974, + "learning_rate": 0.000649700739091409, + "loss": 0.85352778, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.19921875, + "step": 2189, + "time_per_iteration": 2.701725482940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067004, + "balance_loss_mlp": 1.05665708, + "epoch": 0.42131589072720277, + "flos": 1532149530624.0, + "grad_norm": 0.03283150548513283, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.7490328, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.10351562, + "step": 2190, + "time_per_iteration": 4.839817523956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010909, + "balance_loss_mlp": 1.07154024, + "epoch": 0.42150827241246636, + "flos": 566852857344.0, + "grad_norm": 0.06598643326088396, + "language_loss": 0.85153967, + "learning_rate": 0.0006491061210101557, + "loss": 0.86244869, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.19335938, + "step": 2191, + "time_per_iteration": 2.7196173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010923, + "balance_loss_mlp": 1.07263041, + "epoch": 0.4217006540977299, + "flos": 707561174016.0, + "grad_norm": 0.0656106941658015, + "language_loss": 0.83940744, + "learning_rate": 0.0006488087250796157, + "loss": 0.85033047, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.1965332, + "step": 2192, + "time_per_iteration": 2.906759262084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092958, + "balance_loss_mlp": 1.07264447, + "epoch": 0.4218930357829935, + "flos": 627291454464.0, + "grad_norm": 0.07249831154737209, + "language_loss": 0.81628364, + "learning_rate": 0.0006485112713764049, + "loss": 0.82721323, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.203125, + "step": 2193, + "time_per_iteration": 2.92899227142334 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094124, + "balance_loss_mlp": 1.0746212, + "epoch": 0.422085417468257, + "flos": 460345628160.0, + "grad_norm": 0.06737861087768351, + "language_loss": 0.83769715, + "learning_rate": 0.0006482137600160051, + "loss": 0.8486383, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.19506836, + "step": 2194, + "time_per_iteration": 2.5262770652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085984, + "balance_loss_mlp": 1.06623149, + "epoch": 0.4222777991535206, + "flos": 474026577408.0, + "grad_norm": 0.06292139363287808, + "language_loss": 0.845213, + "learning_rate": 0.0006479161911139206, + "loss": 0.85607278, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.1973877, + "step": 2195, + "time_per_iteration": 2.6160459518432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108518, + "balance_loss_mlp": 1.06428266, + "epoch": 0.4224701808387841, + "flos": 470886151680.0, + "grad_norm": 0.08901996634588341, + "language_loss": 0.8583566, + "learning_rate": 0.0006476185647856778, + "loss": 0.8692084, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.2088623, + "step": 2196, + "time_per_iteration": 2.5868523120880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092161, + "balance_loss_mlp": 1.07174015, + "epoch": 0.4226625625240477, + "flos": 677525783040.0, + "grad_norm": 0.08593083287674207, + "language_loss": 0.8143295, + "learning_rate": 0.0006473208811468255, + "loss": 0.8252511, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.20422363, + "step": 2197, + "time_per_iteration": 2.8999974727630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094808, + "balance_loss_mlp": 1.07459044, + "epoch": 0.4228549442093113, + "flos": 503525652480.0, + "grad_norm": 0.06766081582077942, + "language_loss": 0.84457636, + "learning_rate": 0.0006470231403129347, + "loss": 0.85552448, + 
"num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.20214844, + "step": 2198, + "time_per_iteration": 2.6292834281921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100004, + "balance_loss_mlp": 1.08031106, + "epoch": 0.42304732589457483, + "flos": 611848857600.0, + "grad_norm": 0.06420895179660353, + "language_loss": 0.81433302, + "learning_rate": 0.0006467253423995988, + "loss": 0.82533306, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.19677734, + "step": 2199, + "time_per_iteration": 2.891252040863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106456, + "balance_loss_mlp": 1.08667946, + "epoch": 0.4232397075798384, + "flos": 515570863104.0, + "grad_norm": 0.09520170564639865, + "language_loss": 0.79070157, + "learning_rate": 0.000646427487522433, + "loss": 0.80176616, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.19763184, + "step": 2200, + "time_per_iteration": 2.6773481369018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114103, + "balance_loss_mlp": 1.09451675, + "epoch": 0.42343208926510195, + "flos": 589796868096.0, + "grad_norm": 0.05852623049494667, + "language_loss": 0.8313483, + "learning_rate": 0.0006461295757970749, + "loss": 0.84248924, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.19567871, + "step": 2201, + "time_per_iteration": 2.8689796924591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134362, + "balance_loss_mlp": 1.11422753, + "epoch": 0.42362447095036554, + "flos": 640636521984.0, + "grad_norm": 0.08800937436321304, + "language_loss": 0.8125912, + "learning_rate": 0.0006458316073391839, + "loss": 0.82393485, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.20141602, + "step": 2202, + "time_per_iteration": 2.88208270072937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131451, + "balance_loss_mlp": 1.11259222, + "epoch": 0.42381685263562907, + "flos": 
512680057344.0, + "grad_norm": 0.0666307669316128, + "language_loss": 0.87698853, + "learning_rate": 0.0006455335822644422, + "loss": 0.88830304, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.18847656, + "step": 2203, + "time_per_iteration": 2.670079469680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148041, + "balance_loss_mlp": 1.12951636, + "epoch": 0.42400923432089266, + "flos": 546782400000.0, + "grad_norm": 0.09426146221356531, + "language_loss": 0.77927971, + "learning_rate": 0.0006452355006885527, + "loss": 0.79076016, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.18530273, + "step": 2204, + "time_per_iteration": 2.657381534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113566, + "balance_loss_mlp": 1.11668229, + "epoch": 0.4242016160061562, + "flos": 622154815488.0, + "grad_norm": 0.09902645475712538, + "language_loss": 0.8715145, + "learning_rate": 0.0006449373627272412, + "loss": 0.88287115, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.18969727, + "step": 2205, + "time_per_iteration": 2.731816053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119544, + "balance_loss_mlp": 1.10088801, + "epoch": 0.4243939976914198, + "flos": 571913146368.0, + "grad_norm": 0.08117714281203407, + "language_loss": 0.82472396, + "learning_rate": 0.0006446391684962553, + "loss": 0.8359195, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.18652344, + "step": 2206, + "time_per_iteration": 2.6578545570373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111897, + "balance_loss_mlp": 1.09364557, + "epoch": 0.42458637937668336, + "flos": 448740186624.0, + "grad_norm": 0.07468362398894425, + "language_loss": 0.83251357, + "learning_rate": 0.000644340918111364, + "loss": 0.84363258, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.18249512, + "step": 2207, + "time_per_iteration": 2.56805419921875 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116636, + "balance_loss_mlp": 1.09764564, + "epoch": 0.4247787610619469, + "flos": 435407602176.0, + "grad_norm": 0.07806782722385266, + "language_loss": 0.84652972, + "learning_rate": 0.0006440426116883585, + "loss": 0.85769606, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.18981934, + "step": 2208, + "time_per_iteration": 2.5546016693115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117381, + "balance_loss_mlp": 1.09860539, + "epoch": 0.4249711427472105, + "flos": 496078566912.0, + "grad_norm": 0.06957413499154663, + "language_loss": 0.86008334, + "learning_rate": 0.0006437442493430519, + "loss": 0.87125719, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.18762207, + "step": 2209, + "time_per_iteration": 2.709622621536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116308, + "balance_loss_mlp": 1.09817648, + "epoch": 0.425163524432474, + "flos": 655819587072.0, + "grad_norm": 0.07293604534963509, + "language_loss": 0.86852837, + "learning_rate": 0.000643445831191278, + "loss": 0.87969142, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.18127441, + "step": 2210, + "time_per_iteration": 2.9363558292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129839, + "balance_loss_mlp": 1.11201715, + "epoch": 0.4253559061177376, + "flos": 650608796160.0, + "grad_norm": 0.09052715570846585, + "language_loss": 0.81454134, + "learning_rate": 0.0006431473573488937, + "loss": 0.82583976, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.17834473, + "step": 2211, + "time_per_iteration": 2.824688196182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113072, + "balance_loss_mlp": 1.09480882, + "epoch": 0.42554828780300114, + "flos": 554155333632.0, + "grad_norm": 0.1062817873742978, + "language_loss": 0.8489396, + "learning_rate": 0.0006428488279317765, + "loss": 0.86007035, + 
"num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.18273926, + "step": 2212, + "time_per_iteration": 2.7016141414642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115888, + "balance_loss_mlp": 1.0979948, + "epoch": 0.4257406694882647, + "flos": 514407200256.0, + "grad_norm": 0.11732172807674658, + "language_loss": 0.87377149, + "learning_rate": 0.0006425502430558259, + "loss": 0.88493037, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.17907715, + "step": 2213, + "time_per_iteration": 2.618800640106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119747, + "balance_loss_mlp": 1.10144818, + "epoch": 0.42593305117352825, + "flos": 515645015040.0, + "grad_norm": 0.0715384053232906, + "language_loss": 0.84687829, + "learning_rate": 0.0006422516028369628, + "loss": 0.85807574, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.18310547, + "step": 2214, + "time_per_iteration": 2.6705808639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111255, + "balance_loss_mlp": 1.09299207, + "epoch": 0.42612543285879184, + "flos": 588059813376.0, + "grad_norm": 0.10790889315219483, + "language_loss": 0.83148849, + "learning_rate": 0.0006419529073911296, + "loss": 0.84260106, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.18261719, + "step": 2215, + "time_per_iteration": 2.8703150749206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129195, + "balance_loss_mlp": 1.11081314, + "epoch": 0.42631781454405543, + "flos": 635472345600.0, + "grad_norm": 0.06359649877678734, + "language_loss": 0.85258245, + "learning_rate": 0.0006416541568342901, + "loss": 0.86387444, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.18383789, + "step": 2216, + "time_per_iteration": 2.8891868591308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150464, + "balance_loss_mlp": 1.13197434, + "epoch": 0.42651019622931896, + "flos": 
541161202176.0, + "grad_norm": 0.08324056394919786, + "language_loss": 0.84084767, + "learning_rate": 0.0006413553512824297, + "loss": 0.85235232, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.18481445, + "step": 2217, + "time_per_iteration": 2.7485709190368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.13043642, + "epoch": 0.42670257791458255, + "flos": 558158045184.0, + "grad_norm": 0.07361406588428895, + "language_loss": 0.84362692, + "learning_rate": 0.0006410564908515549, + "loss": 0.85511333, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.18200684, + "step": 2218, + "time_per_iteration": 2.657747507095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147496, + "balance_loss_mlp": 1.12895846, + "epoch": 0.4268949595998461, + "flos": 621309782016.0, + "grad_norm": 0.08313238940479123, + "language_loss": 0.85059869, + "learning_rate": 0.0006407575756576935, + "loss": 0.86207366, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.18530273, + "step": 2219, + "time_per_iteration": 2.7391462326049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151304, + "balance_loss_mlp": 1.13211131, + "epoch": 0.42708734128510967, + "flos": 537919460352.0, + "grad_norm": 0.08558880584649159, + "language_loss": 0.87292302, + "learning_rate": 0.0006404586058168951, + "loss": 0.88443601, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.19189453, + "step": 2220, + "time_per_iteration": 2.7562613487243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142405, + "balance_loss_mlp": 1.12310505, + "epoch": 0.4272797229703732, + "flos": 502865998848.0, + "grad_norm": 0.08712204240656665, + "language_loss": 0.86527437, + "learning_rate": 0.0006401595814452296, + "loss": 0.87669843, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.19287109, + "step": 2221, + "time_per_iteration": 2.6396138668060303 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120795, + "balance_loss_mlp": 1.10141122, + "epoch": 0.4274721046556368, + "flos": 492453955584.0, + "grad_norm": 0.07683160316407273, + "language_loss": 0.80591571, + "learning_rate": 0.000639860502658789, + "loss": 0.81712359, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.19360352, + "step": 2222, + "time_per_iteration": 2.655627489089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115469, + "balance_loss_mlp": 1.09618044, + "epoch": 0.4276644863409004, + "flos": 568367456256.0, + "grad_norm": 0.0619683298423062, + "language_loss": 0.85100698, + "learning_rate": 0.0006395613695736853, + "loss": 0.86216164, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.19287109, + "step": 2223, + "time_per_iteration": 2.701129674911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103811, + "balance_loss_mlp": 1.08472598, + "epoch": 0.4278568680261639, + "flos": 607436112384.0, + "grad_norm": 0.07797079059499014, + "language_loss": 0.81455553, + "learning_rate": 0.0006392621823060529, + "loss": 0.82559359, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.19067383, + "step": 2224, + "time_per_iteration": 2.7364578247070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099286, + "balance_loss_mlp": 1.08043897, + "epoch": 0.4280492497114275, + "flos": 560527589376.0, + "grad_norm": 0.08496205952123127, + "language_loss": 0.84790826, + "learning_rate": 0.0006389629409720465, + "loss": 0.85890114, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.18835449, + "step": 2225, + "time_per_iteration": 2.673173427581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109636, + "balance_loss_mlp": 1.07715571, + "epoch": 0.428241631396691, + "flos": 720646709760.0, + "grad_norm": 0.0715414323965843, + "language_loss": 0.88466454, + "learning_rate": 0.0006386636456878417, + "loss": 0.89562809, 
+ "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.19177246, + "step": 2226, + "time_per_iteration": 2.9119651317596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098271, + "balance_loss_mlp": 1.07898331, + "epoch": 0.4284340130819546, + "flos": 429467774976.0, + "grad_norm": 0.09078876082736503, + "language_loss": 0.91914666, + "learning_rate": 0.0006383642965696353, + "loss": 0.93012941, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.19262695, + "step": 2227, + "time_per_iteration": 2.546172618865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105185, + "balance_loss_mlp": 1.08565903, + "epoch": 0.42862639476721814, + "flos": 525016733184.0, + "grad_norm": 0.10289049243839221, + "language_loss": 0.83054781, + "learning_rate": 0.000638064893733645, + "loss": 0.84159964, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.19506836, + "step": 2228, + "time_per_iteration": 2.752192735671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110948, + "balance_loss_mlp": 1.09085989, + "epoch": 0.42881877645248173, + "flos": 465346446336.0, + "grad_norm": 0.15473525900744378, + "language_loss": 0.89614534, + "learning_rate": 0.000637765437296109, + "loss": 0.90724015, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.18615723, + "step": 2229, + "time_per_iteration": 2.6742892265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106608, + "balance_loss_mlp": 1.08742726, + "epoch": 0.42901115813774526, + "flos": 560297793024.0, + "grad_norm": 0.06911950421263405, + "language_loss": 0.8512131, + "learning_rate": 0.000637465927373287, + "loss": 0.86227918, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.19165039, + "step": 2230, + "time_per_iteration": 2.6567254066467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103693, + "balance_loss_mlp": 1.08500099, + "epoch": 0.42920353982300885, + "flos": 
561454115328.0, + "grad_norm": 0.08280955993669904, + "language_loss": 0.78714275, + "learning_rate": 0.000637166364081459, + "loss": 0.79817969, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.18688965, + "step": 2231, + "time_per_iteration": 2.671881914138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118758, + "balance_loss_mlp": 1.10104382, + "epoch": 0.42939592150827244, + "flos": 556248093696.0, + "grad_norm": 0.10217834412041502, + "language_loss": 0.84177876, + "learning_rate": 0.0006368667475369256, + "loss": 0.85296631, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.17736816, + "step": 2232, + "time_per_iteration": 2.760406732559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_mlp": 1.03175175, + "epoch": 0.42958830319353597, + "flos": 1521623688192.0, + "grad_norm": 0.029167273687310865, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79569829, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.10302734, + "step": 2233, + "time_per_iteration": 4.915542840957642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_mlp": 1.02887213, + "epoch": 0.42978068487879956, + "flos": 1495813837824.0, + "grad_norm": 0.028672121204767892, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79934502, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.10205078, + "step": 2234, + "time_per_iteration": 4.8368518352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158883, + "balance_loss_mlp": 1.14040589, + "epoch": 0.4299730665640631, + "flos": 546992372736.0, + "grad_norm": 0.1071521836349002, + "language_loss": 0.85815042, + "learning_rate": 0.0006359675795504112, + "loss": 0.86973917, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.18481445, + "step": 2235, + "time_per_iteration": 2.689207077026367 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157771, + "balance_loss_mlp": 1.13929391, + "epoch": 0.4301654482493267, + "flos": 1129293342720.0, + "grad_norm": 0.08968188926211089, + "language_loss": 0.74473494, + "learning_rate": 0.0006356677511584775, + "loss": 0.75631261, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.18481445, + "step": 2236, + "time_per_iteration": 3.4835057258605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140943, + "balance_loss_mlp": 1.12231028, + "epoch": 0.4303578299345902, + "flos": 495750025728.0, + "grad_norm": 0.07661214353194774, + "language_loss": 0.86188674, + "learning_rate": 0.0006353678700956511, + "loss": 0.8732962, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.18615723, + "step": 2237, + "time_per_iteration": 2.5932724475860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122051, + "balance_loss_mlp": 1.10363352, + "epoch": 0.4305502116198538, + "flos": 615762736128.0, + "grad_norm": 0.10135375141644645, + "language_loss": 0.83612645, + "learning_rate": 0.0006350679364783569, + "loss": 0.84734702, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.1842041, + "step": 2238, + "time_per_iteration": 2.799832582473755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116492, + "balance_loss_mlp": 1.09846783, + "epoch": 0.4307425933051173, + "flos": 559260039168.0, + "grad_norm": 0.08578747749075483, + "language_loss": 0.85542685, + "learning_rate": 0.0006347679504230393, + "loss": 0.86659181, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.18041992, + "step": 2239, + "time_per_iteration": 2.692394971847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101167, + "balance_loss_mlp": 1.08270121, + "epoch": 0.4309349749903809, + "flos": 972166344192.0, + "grad_norm": 0.07961944034188723, + "language_loss": 0.76030314, + "learning_rate": 0.0006344679120461632, + "loss": 
0.77131486, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.18444824, + "step": 2240, + "time_per_iteration": 3.3374927043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095492, + "balance_loss_mlp": 1.07701421, + "epoch": 0.4311273566756445, + "flos": 541924743168.0, + "grad_norm": 0.0793940534533153, + "language_loss": 0.7985338, + "learning_rate": 0.0006341678214642134, + "loss": 0.80948877, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.18469238, + "step": 2241, + "time_per_iteration": 2.6277148723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106602, + "balance_loss_mlp": 1.08830297, + "epoch": 0.43131973836090803, + "flos": 761674503168.0, + "grad_norm": 0.08042276557968771, + "language_loss": 0.82835627, + "learning_rate": 0.0006338676787936963, + "loss": 0.83942229, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.18286133, + "step": 2242, + "time_per_iteration": 3.1297900676727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108169, + "balance_loss_mlp": 1.08982253, + "epoch": 0.4315121200461716, + "flos": 554530862592.0, + "grad_norm": 0.09204417916973401, + "language_loss": 0.8383373, + "learning_rate": 0.0006335674841511367, + "loss": 0.84941894, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.18347168, + "step": 2243, + "time_per_iteration": 2.667814254760742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093207, + "balance_loss_mlp": 1.08414674, + "epoch": 0.43170450173143515, + "flos": 1485334609920.0, + "grad_norm": 0.03538748768114217, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80274379, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.09082031, + "step": 2244, + "time_per_iteration": 4.997291803359985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085049, + "balance_loss_mlp": 1.07603705, + "epoch": 0.43189688341669874, + 
"flos": 1473697234944.0, + "grad_norm": 0.03507908076143408, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78450596, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.09033203, + "step": 2245, + "time_per_iteration": 4.884565591812134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114789, + "balance_loss_mlp": 1.09558439, + "epoch": 0.43208926510196227, + "flos": 492938141184.0, + "grad_norm": 0.08187280769981854, + "language_loss": 0.82496786, + "learning_rate": 0.0006326665895567652, + "loss": 0.83611572, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.1920166, + "step": 2246, + "time_per_iteration": 2.6677396297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123468, + "balance_loss_mlp": 1.10469246, + "epoch": 0.43228164678722586, + "flos": 520235799552.0, + "grad_norm": 0.08598825839477024, + "language_loss": 0.86984897, + "learning_rate": 0.0006323661881916976, + "loss": 0.88108367, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.1875, + "step": 2247, + "time_per_iteration": 2.7388386726379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117292, + "balance_loss_mlp": 1.09908867, + "epoch": 0.4324740284724894, + "flos": 796056201216.0, + "grad_norm": 0.06738996012815959, + "language_loss": 0.80918467, + "learning_rate": 0.0006320657354375179, + "loss": 0.82035756, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.18212891, + "step": 2248, + "time_per_iteration": 3.047557830810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130282, + "balance_loss_mlp": 1.11192417, + "epoch": 0.432666410157753, + "flos": 482153140224.0, + "grad_norm": 0.08033421843515161, + "language_loss": 0.86710787, + "learning_rate": 0.0006317652314108726, + "loss": 0.8784107, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.18347168, + "step": 2249, + "time_per_iteration": 2.547611713409424 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121909, + "balance_loss_mlp": 1.10406351, + "epoch": 0.43285879184301657, + "flos": 500212329984.0, + "grad_norm": 0.07824522100123071, + "language_loss": 0.91323555, + "learning_rate": 0.0006314646762284277, + "loss": 0.92445469, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.17858887, + "step": 2250, + "time_per_iteration": 2.648721933364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024653, + "balance_loss_mlp": 1.01502049, + "epoch": 0.4330511735282801, + "flos": 1510448103936.0, + "grad_norm": 0.012196079218770799, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76450479, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.09619141, + "step": 2251, + "time_per_iteration": 4.9720799922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134181, + "balance_loss_mlp": 1.11596584, + "epoch": 0.4332435552135437, + "flos": 699582915072.0, + "grad_norm": 0.07706489930265227, + "language_loss": 0.77657586, + "learning_rate": 0.0006308634128629022, + "loss": 0.78791773, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.18225098, + "step": 2252, + "time_per_iteration": 2.898723602294922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131863, + "balance_loss_mlp": 1.11357653, + "epoch": 0.4334359368988072, + "flos": 592292321280.0, + "grad_norm": 0.09977200174188003, + "language_loss": 0.87270236, + "learning_rate": 0.0006305627049132531, + "loss": 0.88402092, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.18286133, + "step": 2253, + "time_per_iteration": 2.854081153869629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120951, + "balance_loss_mlp": 1.1019249, + "epoch": 0.4336283185840708, + "flos": 842806508544.0, + "grad_norm": 0.08155008814068082, + "language_loss": 0.8592571, + "learning_rate": 0.0006302619462746662, + "loss": 
0.87046659, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.19018555, + "step": 2254, + "time_per_iteration": 3.164759397506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111221, + "balance_loss_mlp": 1.09445965, + "epoch": 0.43382070026933434, + "flos": 626258843136.0, + "grad_norm": 0.0732322900076577, + "language_loss": 0.90031815, + "learning_rate": 0.0006299611370639069, + "loss": 0.91144025, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.17773438, + "step": 2255, + "time_per_iteration": 2.753937005996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111399, + "balance_loss_mlp": 1.09258795, + "epoch": 0.4340130819545979, + "flos": 591111406080.0, + "grad_norm": 0.07459277492074774, + "language_loss": 0.79176068, + "learning_rate": 0.0006296602773977593, + "loss": 0.80287468, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.18798828, + "step": 2256, + "time_per_iteration": 2.720043659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111381, + "balance_loss_mlp": 1.09282053, + "epoch": 0.4342054636398615, + "flos": 490889797632.0, + "grad_norm": 0.06314614385855079, + "language_loss": 0.873402, + "learning_rate": 0.0006293593673930277, + "loss": 0.88451576, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.18566895, + "step": 2257, + "time_per_iteration": 2.7014408111572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102122, + "balance_loss_mlp": 1.0837394, + "epoch": 0.43439784532512504, + "flos": 698994842112.0, + "grad_norm": 0.07573255135522808, + "language_loss": 0.78537059, + "learning_rate": 0.0006290584071665358, + "loss": 0.79639179, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.18371582, + "step": 2258, + "time_per_iteration": 2.9237425327301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109661, + "balance_loss_mlp": 1.09070623, + "epoch": 0.43459022701038863, + 
"flos": 485824739328.0, + "grad_norm": 0.09488327166679841, + "language_loss": 0.82044512, + "learning_rate": 0.0006287573968351266, + "loss": 0.83154172, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.18945312, + "step": 2259, + "time_per_iteration": 2.574779748916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100612, + "balance_loss_mlp": 1.08195579, + "epoch": 0.43478260869565216, + "flos": 643107382272.0, + "grad_norm": 0.08898100409874855, + "language_loss": 0.82007015, + "learning_rate": 0.0006284563365156626, + "loss": 0.83107626, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.18652344, + "step": 2260, + "time_per_iteration": 2.8346612453460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107563, + "balance_loss_mlp": 1.08845389, + "epoch": 0.43497499038091575, + "flos": 426097552896.0, + "grad_norm": 0.09100088182337301, + "language_loss": 0.87183499, + "learning_rate": 0.0006281552263250261, + "loss": 0.88291061, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.19116211, + "step": 2261, + "time_per_iteration": 2.549306631088257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054669, + "balance_loss_mlp": 1.04460812, + "epoch": 0.4351673720661793, + "flos": 1538378625024.0, + "grad_norm": 0.02508916228462863, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81746203, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.10058594, + "step": 2262, + "time_per_iteration": 4.837932348251343 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104657, + "balance_loss_mlp": 1.08554804, + "epoch": 0.43535975375144287, + "flos": 749155018752.0, + "grad_norm": 0.08522062407758652, + "language_loss": 0.81544203, + "learning_rate": 0.0006275528567978593, + "loss": 0.82648861, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.19091797, + "step": 2263, + "time_per_iteration": 
2.936924934387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112769, + "balance_loss_mlp": 1.09411263, + "epoch": 0.4355521354367064, + "flos": 861280874496.0, + "grad_norm": 0.07411268466258768, + "language_loss": 0.826931, + "learning_rate": 0.0006272515976951898, + "loss": 0.83805871, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.18640137, + "step": 2264, + "time_per_iteration": 3.0930423736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107636, + "balance_loss_mlp": 1.08872962, + "epoch": 0.43574451712197, + "flos": 734527719936.0, + "grad_norm": 0.09109036690828846, + "language_loss": 0.79239774, + "learning_rate": 0.0006269502891890687, + "loss": 0.80347407, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.18896484, + "step": 2265, + "time_per_iteration": 3.0183792114257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107502, + "balance_loss_mlp": 1.08883369, + "epoch": 0.4359368988072336, + "flos": 570578784768.0, + "grad_norm": 0.05550243860706018, + "language_loss": 0.87779111, + "learning_rate": 0.0006266489313964743, + "loss": 0.88886613, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.18652344, + "step": 2266, + "time_per_iteration": 2.7831835746765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121045, + "balance_loss_mlp": 1.10263872, + "epoch": 0.4361292804924971, + "flos": 555528969216.0, + "grad_norm": 0.0703513545387446, + "language_loss": 0.85298383, + "learning_rate": 0.0006263475244344041, + "loss": 0.86419421, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.18395996, + "step": 2267, + "time_per_iteration": 2.857132911682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118688, + "balance_loss_mlp": 1.10052013, + "epoch": 0.4363216621777607, + "flos": 557285847552.0, + "grad_norm": 0.08642791248778911, + "language_loss": 0.84379327, + "learning_rate": 
0.0006260460684198746, + "loss": 0.85498011, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.1817627, + "step": 2268, + "time_per_iteration": 2.692237138748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107605, + "balance_loss_mlp": 1.08955705, + "epoch": 0.4365140438630242, + "flos": 478222009344.0, + "grad_norm": 0.0923795472926113, + "language_loss": 0.84379983, + "learning_rate": 0.0006257445634699213, + "loss": 0.85487592, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.18066406, + "step": 2269, + "time_per_iteration": 2.5514066219329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113893, + "balance_loss_mlp": 1.0958451, + "epoch": 0.4367064255482878, + "flos": 578917891584.0, + "grad_norm": 0.07185982898842977, + "language_loss": 0.82919574, + "learning_rate": 0.0006254430097015993, + "loss": 0.84033465, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.18054199, + "step": 2270, + "time_per_iteration": 2.70414662361145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039786, + "balance_loss_mlp": 1.02981973, + "epoch": 0.43689880723355135, + "flos": 1458946225152.0, + "grad_norm": 0.018847560898896817, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.7751888, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.09960938, + "step": 2271, + "time_per_iteration": 4.881477355957031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109453, + "balance_loss_mlp": 1.09232235, + "epoch": 0.43709118891881493, + "flos": 667610408448.0, + "grad_norm": 0.06834440940873689, + "language_loss": 0.85169542, + "learning_rate": 0.0006248397561781609, + "loss": 0.86278993, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.17138672, + "step": 2272, + "time_per_iteration": 2.9807589054107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114956, + "balance_loss_mlp": 1.09752727, + 
"epoch": 0.43728357060407846, + "flos": 544872448512.0, + "grad_norm": 0.08779020279595867, + "language_loss": 0.85627788, + "learning_rate": 0.0006245380566572482, + "loss": 0.86742747, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.17456055, + "step": 2273, + "time_per_iteration": 2.6780998706817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113394, + "balance_loss_mlp": 1.09640646, + "epoch": 0.43747595228934205, + "flos": 746839802880.0, + "grad_norm": 0.07304773845504615, + "language_loss": 0.75851929, + "learning_rate": 0.0006242363087863744, + "loss": 0.7696532, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.17004395, + "step": 2274, + "time_per_iteration": 2.9744510650634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116168, + "balance_loss_mlp": 1.0989182, + "epoch": 0.43766833397460564, + "flos": 631353636864.0, + "grad_norm": 0.1377417309618575, + "language_loss": 0.86166036, + "learning_rate": 0.0006239345126826878, + "loss": 0.87282199, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.17272949, + "step": 2275, + "time_per_iteration": 2.7981135845184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108683, + "balance_loss_mlp": 1.09152877, + "epoch": 0.43786071565986917, + "flos": 530986295808.0, + "grad_norm": 0.07859590561046474, + "language_loss": 0.83992988, + "learning_rate": 0.0006236326684633561, + "loss": 0.8510167, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.17175293, + "step": 2276, + "time_per_iteration": 2.818861722946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112467, + "balance_loss_mlp": 1.09526503, + "epoch": 0.43805309734513276, + "flos": 538547180544.0, + "grad_norm": 0.07703424900820159, + "language_loss": 0.74875319, + "learning_rate": 0.0006233307762455658, + "loss": 0.75987786, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.17224121, + "step": 2277, + 
"time_per_iteration": 2.6329345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113593, + "balance_loss_mlp": 1.09641492, + "epoch": 0.4382454790303963, + "flos": 864542439936.0, + "grad_norm": 0.08103172587748399, + "language_loss": 0.83020627, + "learning_rate": 0.0006230288361465216, + "loss": 0.84134221, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.17199707, + "step": 2278, + "time_per_iteration": 3.093740701675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121877, + "balance_loss_mlp": 1.10465097, + "epoch": 0.4384378607156599, + "flos": 765499548672.0, + "grad_norm": 0.0865781646571655, + "language_loss": 0.8464967, + "learning_rate": 0.0006227268482834473, + "loss": 0.85771543, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.17248535, + "step": 2279, + "time_per_iteration": 2.9201176166534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125481, + "balance_loss_mlp": 1.10830259, + "epoch": 0.4386302424009234, + "flos": 668566669824.0, + "grad_norm": 0.07906200997295257, + "language_loss": 0.86881065, + "learning_rate": 0.000622424812773585, + "loss": 0.88006544, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.17199707, + "step": 2280, + "time_per_iteration": 2.8375024795532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133302, + "balance_loss_mlp": 1.11602879, + "epoch": 0.438822624086187, + "flos": 485182338048.0, + "grad_norm": 0.07902412331438459, + "language_loss": 0.79696977, + "learning_rate": 0.000622122729734195, + "loss": 0.80830276, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.17285156, + "step": 2281, + "time_per_iteration": 2.587625741958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127945, + "balance_loss_mlp": 1.11082637, + "epoch": 0.4390150057714506, + "flos": 499218992640.0, + "grad_norm": 0.06489318495758713, + "language_loss": 0.87247634, + "learning_rate": 
0.0006218205992825566, + "loss": 0.8837558, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.17138672, + "step": 2282, + "time_per_iteration": 2.6426842212677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132622, + "balance_loss_mlp": 1.11561131, + "epoch": 0.4392073874567141, + "flos": 558219714048.0, + "grad_norm": 0.07249325505401696, + "language_loss": 0.81692946, + "learning_rate": 0.0006215184215359671, + "loss": 0.82825571, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.17016602, + "step": 2283, + "time_per_iteration": 2.7548625469207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131603, + "balance_loss_mlp": 1.11440063, + "epoch": 0.4393997691419777, + "flos": 605306276352.0, + "grad_norm": 0.07525739768421633, + "language_loss": 0.86762762, + "learning_rate": 0.0006212161966117425, + "loss": 0.87894368, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.17224121, + "step": 2284, + "time_per_iteration": 2.738553762435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131945, + "balance_loss_mlp": 1.11446857, + "epoch": 0.43959215082724123, + "flos": 804145688064.0, + "grad_norm": 0.077553661572433, + "language_loss": 0.81615996, + "learning_rate": 0.0006209139246272164, + "loss": 0.82747942, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.17492676, + "step": 2285, + "time_per_iteration": 3.024388074874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133353, + "balance_loss_mlp": 1.11548376, + "epoch": 0.4397845325125048, + "flos": 487643286528.0, + "grad_norm": 0.07341525875363067, + "language_loss": 0.81566632, + "learning_rate": 0.0006206116056997421, + "loss": 0.8269999, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.17871094, + "step": 2286, + "time_per_iteration": 2.5751805305480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130534, + "balance_loss_mlp": 1.11304617, + 
"epoch": 0.43997691419776835, + "flos": 480811438080.0, + "grad_norm": 0.0674295682524957, + "language_loss": 0.82623774, + "learning_rate": 0.0006203092399466892, + "loss": 0.83754307, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.17504883, + "step": 2287, + "time_per_iteration": 2.557861566543579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142808, + "balance_loss_mlp": 1.12514091, + "epoch": 0.44016929588303194, + "flos": 483124082688.0, + "grad_norm": 0.055585597684010626, + "language_loss": 0.84940028, + "learning_rate": 0.0006200068274854473, + "loss": 0.8608284, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.17700195, + "step": 2288, + "time_per_iteration": 2.6604013442993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139053, + "balance_loss_mlp": 1.12110031, + "epoch": 0.4403616775682955, + "flos": 571853675520.0, + "grad_norm": 0.05756252195592342, + "language_loss": 0.85686207, + "learning_rate": 0.0006197043684334229, + "loss": 0.86825264, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.17956543, + "step": 2289, + "time_per_iteration": 2.7742552757263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136744, + "balance_loss_mlp": 1.11905324, + "epoch": 0.44055405925355906, + "flos": 630849627648.0, + "grad_norm": 0.09031384979596896, + "language_loss": 0.78885317, + "learning_rate": 0.0006194018629080411, + "loss": 0.80022061, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.17712402, + "step": 2290, + "time_per_iteration": 2.755141019821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143468, + "balance_loss_mlp": 1.12530041, + "epoch": 0.44074644093882265, + "flos": 536782961664.0, + "grad_norm": 0.10381992178140695, + "language_loss": 0.81444335, + "learning_rate": 0.0006190993110267451, + "loss": 0.82587808, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.18164062, + "step": 2291, + 
"time_per_iteration": 2.717245578765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138273, + "balance_loss_mlp": 1.1200701, + "epoch": 0.4409388226240862, + "flos": 463229093376.0, + "grad_norm": 0.06842071551306793, + "language_loss": 0.84298384, + "learning_rate": 0.0006187967129069958, + "loss": 0.8543666, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.18212891, + "step": 2292, + "time_per_iteration": 2.540931463241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139526, + "balance_loss_mlp": 1.12121558, + "epoch": 0.44113120430934977, + "flos": 566005252608.0, + "grad_norm": 0.07329037094919502, + "language_loss": 0.86953282, + "learning_rate": 0.0006184940686662722, + "loss": 0.88092804, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.18322754, + "step": 2293, + "time_per_iteration": 2.7757341861724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140768, + "balance_loss_mlp": 1.1223979, + "epoch": 0.4413235859946133, + "flos": 543585074688.0, + "grad_norm": 0.08855099948535183, + "language_loss": 0.89983863, + "learning_rate": 0.0006181913784220714, + "loss": 0.9112463, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.18371582, + "step": 2294, + "time_per_iteration": 2.723515510559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040412, + "balance_loss_mlp": 1.03092277, + "epoch": 0.4415159676798769, + "flos": 1569871342080.0, + "grad_norm": 0.030293744399198016, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81594193, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.09472656, + "step": 2295, + "time_per_iteration": 4.940914630889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125035, + "balance_loss_mlp": 1.10708177, + "epoch": 0.4417083493651404, + "flos": 658740128256.0, + "grad_norm": 0.07282895932349266, + "language_loss": 0.79783386, + 
"learning_rate": 0.0006175858603933146, + "loss": 0.80908418, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.1796875, + "step": 2296, + "time_per_iteration": 2.9011893272399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117973, + "balance_loss_mlp": 1.09999609, + "epoch": 0.441900731050404, + "flos": 740457635328.0, + "grad_norm": 0.07093452663269637, + "language_loss": 0.80995864, + "learning_rate": 0.0006172830328438416, + "loss": 0.82113832, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.17993164, + "step": 2297, + "time_per_iteration": 2.984313726425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115845, + "balance_loss_mlp": 1.09765363, + "epoch": 0.44209311273566754, + "flos": 539441399808.0, + "grad_norm": 0.06543332431983825, + "language_loss": 0.87005913, + "learning_rate": 0.0006169801597610572, + "loss": 0.8812176, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.18212891, + "step": 2298, + "time_per_iteration": 2.7446672916412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105385, + "balance_loss_mlp": 1.08803988, + "epoch": 0.4422854944209311, + "flos": 621613730304.0, + "grad_norm": 0.09691889340683667, + "language_loss": 0.89723885, + "learning_rate": 0.0006166772412625469, + "loss": 0.90829265, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.17358398, + "step": 2299, + "time_per_iteration": 2.8357315063476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107801, + "balance_loss_mlp": 1.08962202, + "epoch": 0.4424778761061947, + "flos": 658824192000.0, + "grad_norm": 0.10216386732709903, + "language_loss": 0.81670028, + "learning_rate": 0.0006163742774659141, + "loss": 0.82777828, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.1817627, + "step": 2300, + "time_per_iteration": 2.886781692504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095311, + 
"balance_loss_mlp": 1.07751346, + "epoch": 0.44267025779145824, + "flos": 568577428992.0, + "grad_norm": 0.07973359147829089, + "language_loss": 0.85959738, + "learning_rate": 0.0006160712684887801, + "loss": 0.87055051, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.17822266, + "step": 2301, + "time_per_iteration": 2.7916574478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109187, + "balance_loss_mlp": 1.07431102, + "epoch": 0.44286263947672183, + "flos": 496738220544.0, + "grad_norm": 0.06808021774790461, + "language_loss": 0.82115805, + "learning_rate": 0.0006157682144487832, + "loss": 0.83207679, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.17565918, + "step": 2302, + "time_per_iteration": 2.795738458633423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094413, + "balance_loss_mlp": 1.07613826, + "epoch": 0.44305502116198536, + "flos": 609397820928.0, + "grad_norm": 0.0749153625811459, + "language_loss": 0.83107322, + "learning_rate": 0.0006154651154635793, + "loss": 0.84201735, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.18273926, + "step": 2303, + "time_per_iteration": 4.31014609336853 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090587, + "balance_loss_mlp": 1.07243156, + "epoch": 0.44324740284724895, + "flos": 470794747392.0, + "grad_norm": 0.07642073153592485, + "language_loss": 0.84451294, + "learning_rate": 0.0006151619716508421, + "loss": 0.8554188, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.18164062, + "step": 2304, + "time_per_iteration": 2.6006975173950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090647, + "balance_loss_mlp": 1.07205081, + "epoch": 0.4434397845325125, + "flos": 578725171200.0, + "grad_norm": 0.07612741560937177, + "language_loss": 0.87099224, + "learning_rate": 0.0006148587831282625, + "loss": 0.8818987, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 
0.18591309, + "step": 2305, + "time_per_iteration": 2.7009835243225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048095, + "balance_loss_mlp": 1.03808129, + "epoch": 0.44363216621777607, + "flos": 1496608939008.0, + "grad_norm": 0.019656861653556033, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80224162, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.10009766, + "step": 2306, + "time_per_iteration": 4.9429686069488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108493, + "balance_loss_mlp": 1.06604683, + "epoch": 0.44382454790303966, + "flos": 477322647552.0, + "grad_norm": 0.07723488854599227, + "language_loss": 0.87132251, + "learning_rate": 0.0006142522724244255, + "loss": 0.88217181, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.18884277, + "step": 2307, + "time_per_iteration": 2.553419828414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_mlp": 1.02589071, + "epoch": 0.4440169295883032, + "flos": 1544115820032.0, + "grad_norm": 0.014915460519873193, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77520525, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.09912109, + "step": 2308, + "time_per_iteration": 4.877639055252075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085887, + "balance_loss_mlp": 1.06711113, + "epoch": 0.4442093112735668, + "flos": 591089011200.0, + "grad_norm": 0.07688151387557987, + "language_loss": 0.77357888, + "learning_rate": 0.000613645584293942, + "loss": 0.78443772, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.18762207, + "step": 2309, + "time_per_iteration": 2.9022634029388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088358, + "balance_loss_mlp": 1.06968963, + "epoch": 0.4444016929588303, + "flos": 530272313856.0, + "grad_norm": 0.08682478727714991, + 
"language_loss": 0.83149701, + "learning_rate": 0.0006133421739881185, + "loss": 0.84238064, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.18664551, + "step": 2310, + "time_per_iteration": 2.6619491577148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090945, + "balance_loss_mlp": 1.07256329, + "epoch": 0.4445940746440939, + "flos": 620234952192.0, + "grad_norm": 0.08001840232131298, + "language_loss": 0.82499826, + "learning_rate": 0.0006130387196789605, + "loss": 0.8359077, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.18359375, + "step": 2311, + "time_per_iteration": 2.761312246322632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081759, + "balance_loss_mlp": 1.06348383, + "epoch": 0.4447864563293574, + "flos": 629100089856.0, + "grad_norm": 0.06942740185124545, + "language_loss": 0.84283984, + "learning_rate": 0.0006127352214842795, + "loss": 0.85365742, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.1829834, + "step": 2312, + "time_per_iteration": 2.9890031814575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083785, + "balance_loss_mlp": 1.06565332, + "epoch": 0.444978838014621, + "flos": 650838592512.0, + "grad_norm": 0.07063181629976649, + "language_loss": 0.85067087, + "learning_rate": 0.0006124316795219041, + "loss": 0.86150873, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.18139648, + "step": 2313, + "time_per_iteration": 2.7978243827819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_mlp": 1.06714296, + "epoch": 0.44517121969988455, + "flos": 612439501824.0, + "grad_norm": 0.08238507288636325, + "language_loss": 0.82411474, + "learning_rate": 0.0006121280939096794, + "loss": 0.83496892, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.1829834, + "step": 2314, + "time_per_iteration": 2.767470121383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01087652, + "balance_loss_mlp": 1.06994963, + "epoch": 0.44536360138514813, + "flos": 488735368704.0, + "grad_norm": 0.09711161856626577, + "language_loss": 0.87964773, + "learning_rate": 0.000611824464765468, + "loss": 0.89052415, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.17712402, + "step": 2315, + "time_per_iteration": 2.58632493019104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019027, + "balance_loss_mlp": 1.00934732, + "epoch": 0.4455559830704117, + "flos": 1516148969472.0, + "grad_norm": 0.012462298147770837, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79613966, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.09667969, + "step": 2316, + "time_per_iteration": 4.68027400970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097979, + "balance_loss_mlp": 1.08057404, + "epoch": 0.44574836475567525, + "flos": 615614432256.0, + "grad_norm": 0.09030294554601531, + "language_loss": 0.85568595, + "learning_rate": 0.000611217076352619, + "loss": 0.86666572, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.17419434, + "step": 2317, + "time_per_iteration": 2.8745946884155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096258, + "balance_loss_mlp": 1.07860303, + "epoch": 0.44594074644093884, + "flos": 506342306304.0, + "grad_norm": 0.06320933370201777, + "language_loss": 0.83313119, + "learning_rate": 0.0006109133173197905, + "loss": 0.84409374, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.17675781, + "step": 2318, + "time_per_iteration": 2.719902515411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104888, + "balance_loss_mlp": 1.08753085, + "epoch": 0.44613312812620237, + "flos": 726979318272.0, + "grad_norm": 0.07491768608262588, + "language_loss": 0.85073888, + "learning_rate": 0.0006106095152265935, + "loss": 0.86178774, + "num_input_tokens_seen": 193555952, + 
"router_z_loss_mlp": 0.17370605, + "step": 2319, + "time_per_iteration": 3.004857063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111563, + "balance_loss_mlp": 1.0939796, + "epoch": 0.44632550981146596, + "flos": 635746558464.0, + "grad_norm": 0.08385510801007982, + "language_loss": 0.84405756, + "learning_rate": 0.0006103056701909739, + "loss": 0.85517317, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.17602539, + "step": 2320, + "time_per_iteration": 2.966923475265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113274, + "balance_loss_mlp": 1.09577405, + "epoch": 0.4465178914967295, + "flos": 827074644480.0, + "grad_norm": 0.07685766834781843, + "language_loss": 0.8301264, + "learning_rate": 0.0006100017823308956, + "loss": 0.84125912, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.17504883, + "step": 2321, + "time_per_iteration": 3.204850196838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112805, + "balance_loss_mlp": 1.11025262, + "epoch": 0.4467102731819931, + "flos": 665831508480.0, + "grad_norm": 0.08670302679562208, + "language_loss": 0.79305983, + "learning_rate": 0.0006096978517643377, + "loss": 0.80434036, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.17797852, + "step": 2322, + "time_per_iteration": 2.860180139541626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112379, + "balance_loss_mlp": 1.10644507, + "epoch": 0.4469026548672566, + "flos": 512946929664.0, + "grad_norm": 0.12580563915967458, + "language_loss": 0.83188093, + "learning_rate": 0.0006093938786092968, + "loss": 0.84311885, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.17358398, + "step": 2323, + "time_per_iteration": 2.64030122756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124017, + "balance_loss_mlp": 1.10691094, + "epoch": 0.4470950365525202, + "flos": 684076078080.0, + "grad_norm": 
0.06761406024518349, + "language_loss": 0.89442849, + "learning_rate": 0.0006090898629837857, + "loss": 0.90566862, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.17126465, + "step": 2324, + "time_per_iteration": 2.8378353118896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137759, + "balance_loss_mlp": 1.1204021, + "epoch": 0.4472874182377838, + "flos": 627321189888.0, + "grad_norm": 0.1896235015526922, + "language_loss": 0.87233531, + "learning_rate": 0.0006087858050058337, + "loss": 0.88371289, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.17370605, + "step": 2325, + "time_per_iteration": 2.829404830932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131095, + "balance_loss_mlp": 1.1135118, + "epoch": 0.4474797999230473, + "flos": 547204916736.0, + "grad_norm": 0.07181125336629572, + "language_loss": 0.82417965, + "learning_rate": 0.0006084817047934866, + "loss": 0.83549058, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.17590332, + "step": 2326, + "time_per_iteration": 2.68251371383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134989, + "balance_loss_mlp": 1.11732209, + "epoch": 0.4476721816083109, + "flos": 455819083776.0, + "grad_norm": 0.08385131470703, + "language_loss": 0.89333081, + "learning_rate": 0.0006081775624648066, + "loss": 0.90468073, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.17675781, + "step": 2327, + "time_per_iteration": 2.533090591430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138131, + "balance_loss_mlp": 1.12101269, + "epoch": 0.44786456329357444, + "flos": 481518079488.0, + "grad_norm": 0.10743629798598615, + "language_loss": 0.82534277, + "learning_rate": 0.0006078733781378721, + "loss": 0.83672416, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.17138672, + "step": 2328, + "time_per_iteration": 2.597377061843872 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0111818, + "balance_loss_mlp": 1.10090625, + "epoch": 0.448056944978838, + "flos": 552104418816.0, + "grad_norm": 0.07758231479291984, + "language_loss": 0.82049984, + "learning_rate": 0.0006075691519307781, + "loss": 0.83168161, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.17297363, + "step": 2329, + "time_per_iteration": 2.8866052627563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110227, + "balance_loss_mlp": 1.09251261, + "epoch": 0.44824932666410156, + "flos": 550839439872.0, + "grad_norm": 0.0702768888062288, + "language_loss": 0.81606984, + "learning_rate": 0.0006072648839616356, + "loss": 0.82717204, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.17724609, + "step": 2330, + "time_per_iteration": 2.7015554904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114358, + "balance_loss_mlp": 1.09686995, + "epoch": 0.44844170834936514, + "flos": 988582454784.0, + "grad_norm": 0.07321658937944422, + "language_loss": 0.82347071, + "learning_rate": 0.0006069605743485718, + "loss": 0.83461428, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.1751709, + "step": 2331, + "time_per_iteration": 3.3698229789733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110303, + "balance_loss_mlp": 1.09319615, + "epoch": 0.44863409003462873, + "flos": 591321378816.0, + "grad_norm": 0.07314304322377065, + "language_loss": 0.83288682, + "learning_rate": 0.0006066562232097303, + "loss": 0.84398985, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.17126465, + "step": 2332, + "time_per_iteration": 2.7595224380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109542, + "balance_loss_mlp": 1.09135079, + "epoch": 0.44882647171989226, + "flos": 724646850048.0, + "grad_norm": 0.07260034454336384, + "language_loss": 0.86063141, + "learning_rate": 0.0006063518306632708, + "loss": 0.87172687, + "num_input_tokens_seen": 
194606800, + "router_z_loss_mlp": 0.18200684, + "step": 2333, + "time_per_iteration": 2.973802089691162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110869, + "balance_loss_mlp": 1.09335709, + "epoch": 0.44901885340515585, + "flos": 534927338496.0, + "grad_norm": 0.0724353146925312, + "language_loss": 0.82143402, + "learning_rate": 0.0006060473968273688, + "loss": 0.83254278, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.1751709, + "step": 2334, + "time_per_iteration": 2.716792583465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_mlp": 1.02476275, + "epoch": 0.4492112350904194, + "flos": 1555300942848.0, + "grad_norm": 0.01941960869972046, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.78913647, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.09326172, + "step": 2335, + "time_per_iteration": 4.891199827194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025292, + "balance_loss_mlp": 1.01608956, + "epoch": 0.44940361677568297, + "flos": 1523358171648.0, + "grad_norm": 0.01646335982957884, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82030511, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.09179688, + "step": 2336, + "time_per_iteration": 4.873430013656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112894, + "balance_loss_mlp": 1.09513164, + "epoch": 0.4495959984609465, + "flos": 382495011840.0, + "grad_norm": 0.18670212144629325, + "language_loss": 0.88409269, + "learning_rate": 0.0006051338487650047, + "loss": 0.89522159, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.17785645, + "step": 2337, + "time_per_iteration": 2.4702365398406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106326, + "balance_loss_mlp": 1.08833754, + "epoch": 0.4497883801462101, + "flos": 497879861760.0, + "grad_norm": 
0.08397051973497069, + "language_loss": 0.82701272, + "learning_rate": 0.0006048292509534095, + "loss": 0.83807594, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.17993164, + "step": 2338, + "time_per_iteration": 2.619450569152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110586, + "balance_loss_mlp": 1.08850312, + "epoch": 0.4499807618314736, + "flos": 614450769408.0, + "grad_norm": 0.20046859342765924, + "language_loss": 0.77607334, + "learning_rate": 0.0006045246124434895, + "loss": 0.78713191, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.17370605, + "step": 2339, + "time_per_iteration": 2.7321267127990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105214, + "balance_loss_mlp": 1.08761835, + "epoch": 0.4501731435167372, + "flos": 1005510288384.0, + "grad_norm": 0.08075651314496221, + "language_loss": 0.865839, + "learning_rate": 0.0006042199333535162, + "loss": 0.8768912, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.17614746, + "step": 2340, + "time_per_iteration": 3.306898832321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100999, + "balance_loss_mlp": 1.08355892, + "epoch": 0.4503655252020008, + "flos": 820880428032.0, + "grad_norm": 0.06823291393488413, + "language_loss": 0.83802176, + "learning_rate": 0.0006039152138017763, + "loss": 0.84903181, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.17443848, + "step": 2341, + "time_per_iteration": 3.1458027362823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104806, + "balance_loss_mlp": 1.08727062, + "epoch": 0.4505579068872643, + "flos": 486373165056.0, + "grad_norm": 0.08305826290941032, + "language_loss": 0.83554494, + "learning_rate": 0.0006036104539065726, + "loss": 0.84659296, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.17541504, + "step": 2342, + "time_per_iteration": 2.6648519039154053 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01102434, + "balance_loss_mlp": 1.08492208, + "epoch": 0.4507502885725279, + "flos": 884803046400.0, + "grad_norm": 0.06158872344302024, + "language_loss": 0.84248793, + "learning_rate": 0.000603305653786223, + "loss": 0.85351223, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.17529297, + "step": 2343, + "time_per_iteration": 3.176680326461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113287, + "balance_loss_mlp": 1.09581113, + "epoch": 0.45094267025779144, + "flos": 578339730432.0, + "grad_norm": 0.0747059506481359, + "language_loss": 0.84228522, + "learning_rate": 0.0006030008135590622, + "loss": 0.85341805, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.17480469, + "step": 2344, + "time_per_iteration": 2.742253065109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124309, + "balance_loss_mlp": 1.10722649, + "epoch": 0.45113505194305503, + "flos": 525387492864.0, + "grad_norm": 0.058134829204836994, + "language_loss": 0.799905, + "learning_rate": 0.0006026959333434387, + "loss": 0.81114811, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.17102051, + "step": 2345, + "time_per_iteration": 2.779311180114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132846, + "balance_loss_mlp": 1.11552477, + "epoch": 0.45132743362831856, + "flos": 502055470080.0, + "grad_norm": 0.07509063772314063, + "language_loss": 0.77367598, + "learning_rate": 0.0006023910132577181, + "loss": 0.78500438, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.17346191, + "step": 2346, + "time_per_iteration": 2.6779799461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113297, + "balance_loss_mlp": 1.11554205, + "epoch": 0.45151981531358215, + "flos": 431918811648.0, + "grad_norm": 0.10491289793116987, + "language_loss": 0.84559381, + "learning_rate": 0.0006020860534202806, + "loss": 0.85692352, + "num_input_tokens_seen": 
195930640, + "router_z_loss_mlp": 0.17443848, + "step": 2347, + "time_per_iteration": 2.528663158416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135795, + "balance_loss_mlp": 1.1183548, + "epoch": 0.4517121969988457, + "flos": 712159299072.0, + "grad_norm": 0.07098609761882418, + "language_loss": 0.80898821, + "learning_rate": 0.0006017810539495224, + "loss": 0.82034618, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.17468262, + "step": 2348, + "time_per_iteration": 2.9910202026367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111241, + "balance_loss_mlp": 1.09382474, + "epoch": 0.45190457868410927, + "flos": 579468888576.0, + "grad_norm": 0.07527105168067424, + "language_loss": 0.82186049, + "learning_rate": 0.0006014760149638547, + "loss": 0.83297288, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.17431641, + "step": 2349, + "time_per_iteration": 2.667600631713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124509, + "balance_loss_mlp": 1.10764134, + "epoch": 0.45209696036937286, + "flos": 482657149440.0, + "grad_norm": 0.07463444501983019, + "language_loss": 0.88244182, + "learning_rate": 0.000601170936581704, + "loss": 0.89368689, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.16870117, + "step": 2350, + "time_per_iteration": 2.5531952381134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124884, + "balance_loss_mlp": 1.10763478, + "epoch": 0.4522893420546364, + "flos": 540207512064.0, + "grad_norm": 0.07303827993658786, + "language_loss": 0.84088361, + "learning_rate": 0.0006008658189215121, + "loss": 0.85213244, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.17260742, + "step": 2351, + "time_per_iteration": 2.6667087078094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122786, + "balance_loss_mlp": 1.10538173, + "epoch": 0.4524817237399, + "flos": 496676551680.0, + "grad_norm": 
0.08019313993326724, + "language_loss": 0.80211049, + "learning_rate": 0.0006005606621017366, + "loss": 0.81333834, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.17419434, + "step": 2352, + "time_per_iteration": 2.5864298343658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110112, + "balance_loss_mlp": 1.09249294, + "epoch": 0.4526741054251635, + "flos": 652550681088.0, + "grad_norm": 0.08588176709504687, + "language_loss": 0.80108917, + "learning_rate": 0.0006002554662408496, + "loss": 0.81219029, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.1763916, + "step": 2353, + "time_per_iteration": 2.921902656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106203, + "balance_loss_mlp": 1.08839345, + "epoch": 0.4528664871104271, + "flos": 570939632640.0, + "grad_norm": 0.08839686088246723, + "language_loss": 0.91245115, + "learning_rate": 0.0005999502314573388, + "loss": 0.92351323, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.17822266, + "step": 2354, + "time_per_iteration": 2.6538503170013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098374, + "balance_loss_mlp": 1.08077872, + "epoch": 0.45305886879569063, + "flos": 458719801344.0, + "grad_norm": 0.07972814176434397, + "language_loss": 0.85777891, + "learning_rate": 0.0005996449578697066, + "loss": 0.86876267, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.17590332, + "step": 2355, + "time_per_iteration": 2.6249992847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112931, + "balance_loss_mlp": 1.09541893, + "epoch": 0.4532512504809542, + "flos": 505178643456.0, + "grad_norm": 0.0715197090101731, + "language_loss": 0.81223947, + "learning_rate": 0.0005993396455964709, + "loss": 0.82336879, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.17541504, + "step": 2356, + "time_per_iteration": 2.69350266456604 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01111792, + "balance_loss_mlp": 1.0944469, + "epoch": 0.4534436321662178, + "flos": 582213961728.0, + "grad_norm": 0.07234166204840274, + "language_loss": 0.81097758, + "learning_rate": 0.0005990342947561647, + "loss": 0.82209545, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.17358398, + "step": 2357, + "time_per_iteration": 2.7173328399658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123689, + "balance_loss_mlp": 1.10639215, + "epoch": 0.45363601385148133, + "flos": 549720193536.0, + "grad_norm": 0.09230022277941517, + "language_loss": 0.78124547, + "learning_rate": 0.0005987289054673351, + "loss": 0.79248238, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.1730957, + "step": 2358, + "time_per_iteration": 2.633007526397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108071, + "balance_loss_mlp": 1.09800935, + "epoch": 0.4538283955367449, + "flos": 1474559520768.0, + "grad_norm": 0.0537090739321762, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77683806, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.10058594, + "step": 2359, + "time_per_iteration": 4.852884769439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011235, + "balance_loss_mlp": 1.10622633, + "epoch": 0.45402077722200845, + "flos": 584711986176.0, + "grad_norm": 0.07905851512069884, + "language_loss": 0.91134411, + "learning_rate": 0.0005981180120183722, + "loss": 0.92257917, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.17285156, + "step": 2360, + "time_per_iteration": 2.7044413089752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119068, + "balance_loss_mlp": 1.10053074, + "epoch": 0.45421315890727204, + "flos": 531747265536.0, + "grad_norm": 0.05732939327341075, + "language_loss": 0.85087699, + "learning_rate": 0.0005978125080954089, + "loss": 0.8620677, + "num_input_tokens_seen": 
197094880, + "router_z_loss_mlp": 0.18530273, + "step": 2361, + "time_per_iteration": 2.775712251663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105652, + "balance_loss_mlp": 1.08805668, + "epoch": 0.4544055405925356, + "flos": 785221641216.0, + "grad_norm": 0.0789619101325961, + "language_loss": 0.7727446, + "learning_rate": 0.000597506966198262, + "loss": 0.78380114, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.17614746, + "step": 2362, + "time_per_iteration": 2.974111557006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110357, + "balance_loss_mlp": 1.08590329, + "epoch": 0.45459792227779916, + "flos": 518199939072.0, + "grad_norm": 0.0858902108709268, + "language_loss": 0.83994937, + "learning_rate": 0.0005972013864455536, + "loss": 0.85098517, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.17675781, + "step": 2363, + "time_per_iteration": 2.6244583129882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101233, + "balance_loss_mlp": 1.08366108, + "epoch": 0.4547903039630627, + "flos": 537563755008.0, + "grad_norm": 0.08015454662625561, + "language_loss": 0.851372, + "learning_rate": 0.0005968957689559203, + "loss": 0.86238432, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.17602539, + "step": 2364, + "time_per_iteration": 2.6717097759246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098129, + "balance_loss_mlp": 1.08035493, + "epoch": 0.4549826856483263, + "flos": 528676222464.0, + "grad_norm": 0.07229553193462525, + "language_loss": 0.88592815, + "learning_rate": 0.0005965901138480131, + "loss": 0.89690942, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.17785645, + "step": 2365, + "time_per_iteration": 2.653158664703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098766, + "balance_loss_mlp": 1.08063412, + "epoch": 0.45517506733358987, + "flos": 520915276800.0, + "grad_norm": 
0.07319480450828385, + "language_loss": 0.87207007, + "learning_rate": 0.0005962844212404982, + "loss": 0.88305777, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.18151855, + "step": 2366, + "time_per_iteration": 2.727456569671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110146, + "balance_loss_mlp": 1.0928843, + "epoch": 0.4553674490188534, + "flos": 451052831232.0, + "grad_norm": 0.06525288256406295, + "language_loss": 0.87264466, + "learning_rate": 0.0005959786912520558, + "loss": 0.88374615, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.17285156, + "step": 2367, + "time_per_iteration": 2.6637766361236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107553, + "balance_loss_mlp": 1.08999324, + "epoch": 0.455559830704117, + "flos": 546594448896.0, + "grad_norm": 0.061777726879510934, + "language_loss": 0.8370434, + "learning_rate": 0.0005956729240013806, + "loss": 0.84811896, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.17565918, + "step": 2368, + "time_per_iteration": 2.815329074859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107603, + "balance_loss_mlp": 1.08967423, + "epoch": 0.4557522123893805, + "flos": 583765636608.0, + "grad_norm": 0.07604266440979088, + "language_loss": 0.91824389, + "learning_rate": 0.0005953671196071824, + "loss": 0.92931986, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.17944336, + "step": 2369, + "time_per_iteration": 2.711060047149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111288, + "balance_loss_mlp": 1.09501028, + "epoch": 0.4559445940746441, + "flos": 526415334912.0, + "grad_norm": 0.06552470471898014, + "language_loss": 0.80047917, + "learning_rate": 0.0005950612781881846, + "loss": 0.81160796, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.17871094, + "step": 2370, + "time_per_iteration": 2.710073709487915 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01108328, + "balance_loss_mlp": 1.09072089, + "epoch": 0.45613697575990764, + "flos": 652120823808.0, + "grad_norm": 0.1576706166146413, + "language_loss": 0.75711769, + "learning_rate": 0.0005947553998631259, + "loss": 0.76820099, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.1763916, + "step": 2371, + "time_per_iteration": 2.855384588241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098527, + "balance_loss_mlp": 1.08041906, + "epoch": 0.4563293574451712, + "flos": 867119385600.0, + "grad_norm": 0.056716395855559716, + "language_loss": 0.78911364, + "learning_rate": 0.000594449484750758, + "loss": 0.8000989, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.18127441, + "step": 2372, + "time_per_iteration": 4.694324493408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095337, + "balance_loss_mlp": 1.07693148, + "epoch": 0.45652173913043476, + "flos": 498079922688.0, + "grad_norm": 0.07402703052898342, + "language_loss": 0.82845718, + "learning_rate": 0.0005941435329698484, + "loss": 0.83941054, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.18395996, + "step": 2373, + "time_per_iteration": 2.677161693572998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094792, + "balance_loss_mlp": 1.07592094, + "epoch": 0.45671412081569834, + "flos": 560856130560.0, + "grad_norm": 0.07242003224557565, + "language_loss": 0.82777703, + "learning_rate": 0.0005938375446391778, + "loss": 0.83872497, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.18847656, + "step": 2374, + "time_per_iteration": 2.6986706256866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094981, + "balance_loss_mlp": 1.07626557, + "epoch": 0.45690650250096193, + "flos": 503122959360.0, + "grad_norm": 0.09602017850343586, + "language_loss": 0.88724887, + "learning_rate": 0.0005935315198775415, + "loss": 0.89819872, + "num_input_tokens_seen": 
198131232, + "router_z_loss_mlp": 0.18713379, + "step": 2375, + "time_per_iteration": 2.6160995960235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097379, + "balance_loss_mlp": 1.07811522, + "epoch": 0.45709888418622546, + "flos": 430698249216.0, + "grad_norm": 0.07644315743317759, + "language_loss": 0.86640108, + "learning_rate": 0.0005932254588037486, + "loss": 0.87737489, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.19262695, + "step": 2376, + "time_per_iteration": 2.5169382095336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097089, + "balance_loss_mlp": 1.07751513, + "epoch": 0.45729126587148905, + "flos": 525654365184.0, + "grad_norm": 0.07850584285058836, + "language_loss": 0.86183727, + "learning_rate": 0.000592919361536623, + "loss": 0.87280822, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.19580078, + "step": 2377, + "time_per_iteration": 2.668555498123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099166, + "balance_loss_mlp": 1.07996106, + "epoch": 0.4574836475567526, + "flos": 638002676736.0, + "grad_norm": 0.07491389260925961, + "language_loss": 0.89019889, + "learning_rate": 0.0005926132281950017, + "loss": 0.90119052, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.19213867, + "step": 2378, + "time_per_iteration": 2.7553632259368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098644, + "balance_loss_mlp": 1.07934439, + "epoch": 0.45767602924201617, + "flos": 649588294656.0, + "grad_norm": 0.07088499852096378, + "language_loss": 0.84996307, + "learning_rate": 0.0005923070588977367, + "loss": 0.86094952, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.19287109, + "step": 2379, + "time_per_iteration": 2.8268253803253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105163, + "balance_loss_mlp": 1.08666205, + "epoch": 0.4578684109272797, + "flos": 746676817920.0, + 
"grad_norm": 0.08663232567685626, + "language_loss": 0.85752875, + "learning_rate": 0.0005920008537636931, + "loss": 0.86858034, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.18493652, + "step": 2380, + "time_per_iteration": 2.9154610633850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111966, + "balance_loss_mlp": 1.09322584, + "epoch": 0.4580607926125433, + "flos": 641469072384.0, + "grad_norm": 0.06304298978525442, + "language_loss": 0.86810696, + "learning_rate": 0.0005916946129117504, + "loss": 0.87922657, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.18725586, + "step": 2381, + "time_per_iteration": 2.9332261085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116157, + "balance_loss_mlp": 1.09857368, + "epoch": 0.4582531742978069, + "flos": 801857636352.0, + "grad_norm": 0.07662767679861947, + "language_loss": 0.81012738, + "learning_rate": 0.0005913883364608017, + "loss": 0.821289, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.17602539, + "step": 2382, + "time_per_iteration": 3.0999624729156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122668, + "balance_loss_mlp": 1.104954, + "epoch": 0.4584455559830704, + "flos": 684295962624.0, + "grad_norm": 0.07647659587343762, + "language_loss": 0.88500929, + "learning_rate": 0.0005910820245297542, + "loss": 0.89623594, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.17724609, + "step": 2383, + "time_per_iteration": 2.8880879878997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124722, + "balance_loss_mlp": 1.10707903, + "epoch": 0.458637937668334, + "flos": 518177544192.0, + "grad_norm": 0.0900951330432027, + "language_loss": 0.80609989, + "learning_rate": 0.000590775677237529, + "loss": 0.81734717, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.17651367, + "step": 2384, + "time_per_iteration": 2.758249044418335 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133426, + "balance_loss_mlp": 1.11639071, + "epoch": 0.4588303193535975, + "flos": 505499844096.0, + "grad_norm": 0.08076424564900554, + "language_loss": 0.79984713, + "learning_rate": 0.0005904692947030601, + "loss": 0.81118137, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.17053223, + "step": 2385, + "time_per_iteration": 2.667224168777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129309, + "balance_loss_mlp": 1.11242914, + "epoch": 0.4590227010388611, + "flos": 495905670144.0, + "grad_norm": 0.10079326608985974, + "language_loss": 0.89998889, + "learning_rate": 0.0005901628770452963, + "loss": 0.91128194, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.16894531, + "step": 2386, + "time_per_iteration": 2.5951790809631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129955, + "balance_loss_mlp": 1.1131345, + "epoch": 0.45921508272412465, + "flos": 493620189696.0, + "grad_norm": 0.06835358350560915, + "language_loss": 0.87016714, + "learning_rate": 0.000589856424383199, + "loss": 0.88146669, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.16833496, + "step": 2387, + "time_per_iteration": 2.6031622886657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112576, + "balance_loss_mlp": 1.1086055, + "epoch": 0.45940746440938823, + "flos": 691394683392.0, + "grad_norm": 0.07768127603303249, + "language_loss": 0.82945853, + "learning_rate": 0.000589549936835744, + "loss": 0.84071612, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.17175293, + "step": 2388, + "time_per_iteration": 2.903437376022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112224, + "balance_loss_mlp": 1.10476351, + "epoch": 0.45959984609465176, + "flos": 503738196480.0, + "grad_norm": 0.06100287690428954, + "language_loss": 0.78894806, + "learning_rate": 0.0005892434145219202, + "loss": 0.80017042, + 
"num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.17504883, + "step": 2389, + "time_per_iteration": 2.61372709274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104771, + "balance_loss_mlp": 1.08758104, + "epoch": 0.45979222777991535, + "flos": 676638904320.0, + "grad_norm": 0.07434011004541237, + "language_loss": 0.8214674, + "learning_rate": 0.0005889368575607303, + "loss": 0.83251518, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.17211914, + "step": 2390, + "time_per_iteration": 2.894376039505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113067, + "balance_loss_mlp": 1.09576964, + "epoch": 0.45998460946517894, + "flos": 777653415936.0, + "grad_norm": 0.08125857985315703, + "language_loss": 0.78747576, + "learning_rate": 0.00058863026607119, + "loss": 0.79860646, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.1730957, + "step": 2391, + "time_per_iteration": 3.112093210220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118062, + "balance_loss_mlp": 1.10093117, + "epoch": 0.46017699115044247, + "flos": 851461673472.0, + "grad_norm": 0.08788037013511367, + "language_loss": 0.7955699, + "learning_rate": 0.0005883236401723287, + "loss": 0.80675054, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.17150879, + "step": 2392, + "time_per_iteration": 3.242553472518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110798, + "balance_loss_mlp": 1.09295249, + "epoch": 0.46036937283570606, + "flos": 575878781952.0, + "grad_norm": 0.08816777762822899, + "language_loss": 0.84516722, + "learning_rate": 0.0005880169799831893, + "loss": 0.8562752, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.17858887, + "step": 2393, + "time_per_iteration": 2.6654422283172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098957, + "balance_loss_mlp": 1.08111119, + "epoch": 0.4605617545209696, + "flos": 
611866109952.0, + "grad_norm": 0.10997873970116459, + "language_loss": 0.81234348, + "learning_rate": 0.0005877102856228278, + "loss": 0.82333302, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.17858887, + "step": 2394, + "time_per_iteration": 2.873918294906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103019, + "balance_loss_mlp": 1.08542323, + "epoch": 0.4607541362062332, + "flos": 533138526720.0, + "grad_norm": 0.07484934817589016, + "language_loss": 0.84600067, + "learning_rate": 0.0005874035572103133, + "loss": 0.85703087, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.17602539, + "step": 2395, + "time_per_iteration": 2.6604511737823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106186, + "balance_loss_mlp": 1.08816206, + "epoch": 0.4609465178914967, + "flos": 647312726016.0, + "grad_norm": 0.09236346174205023, + "language_loss": 0.82285452, + "learning_rate": 0.0005870967948647288, + "loss": 0.83391643, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.18041992, + "step": 2396, + "time_per_iteration": 2.805236339569092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088357, + "balance_loss_mlp": 1.0784868, + "epoch": 0.4611388995767603, + "flos": 1466287225344.0, + "grad_norm": 0.0372592343397745, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75396657, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.09863281, + "step": 2397, + "time_per_iteration": 5.380864143371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114264, + "balance_loss_mlp": 1.09671664, + "epoch": 0.46133128126202383, + "flos": 723112427520.0, + "grad_norm": 0.08046670019017348, + "language_loss": 0.85787129, + "learning_rate": 0.0005864831688507443, + "loss": 0.86901391, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.17553711, + "step": 2398, + "time_per_iteration": 3.1147820949554443 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119108, + "balance_loss_mlp": 1.10053492, + "epoch": 0.4615236629472874, + "flos": 548010302976.0, + "grad_norm": 0.08636966322347801, + "language_loss": 0.75248241, + "learning_rate": 0.0005861763054205754, + "loss": 0.76367348, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.18566895, + "step": 2399, + "time_per_iteration": 2.787648916244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126018, + "balance_loss_mlp": 1.10773087, + "epoch": 0.461716044632551, + "flos": 602244771840.0, + "grad_norm": 0.07252969708721291, + "language_loss": 0.80419457, + "learning_rate": 0.0005858694085337976, + "loss": 0.81545472, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.18273926, + "step": 2400, + "time_per_iteration": 2.859846591949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113297, + "balance_loss_mlp": 1.09409237, + "epoch": 0.46190842631781454, + "flos": 474476258304.0, + "grad_norm": 0.08888433146403377, + "language_loss": 0.83730817, + "learning_rate": 0.0005855624783095589, + "loss": 0.84844118, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.19189453, + "step": 2401, + "time_per_iteration": 2.5447638034820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107409, + "balance_loss_mlp": 1.08806109, + "epoch": 0.4621008080030781, + "flos": 437483109888.0, + "grad_norm": 0.06969383703523749, + "language_loss": 0.85055763, + "learning_rate": 0.00058525551486702, + "loss": 0.86163163, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.19335938, + "step": 2402, + "time_per_iteration": 2.5561320781707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099434, + "balance_loss_mlp": 1.08090901, + "epoch": 0.46229318968834165, + "flos": 525461644800.0, + "grad_norm": 0.0974904106662223, + "language_loss": 0.80911911, + "learning_rate": 0.0005849485183253548, + "loss": 0.82011348, 
+ "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.18530273, + "step": 2403, + "time_per_iteration": 2.6459126472473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099055, + "balance_loss_mlp": 1.08017266, + "epoch": 0.46248557137360524, + "flos": 439622857728.0, + "grad_norm": 0.06563821415676413, + "language_loss": 0.87331611, + "learning_rate": 0.0005846414888037501, + "loss": 0.88430667, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.18896484, + "step": 2404, + "time_per_iteration": 2.5333003997802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091218, + "balance_loss_mlp": 1.07249045, + "epoch": 0.4626779530588688, + "flos": 617608447488.0, + "grad_norm": 0.06903002712252786, + "language_loss": 0.82273191, + "learning_rate": 0.0005843344264214049, + "loss": 0.83364403, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.18701172, + "step": 2405, + "time_per_iteration": 2.806748628616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103621, + "balance_loss_mlp": 1.08491707, + "epoch": 0.46287033474413236, + "flos": 670108432896.0, + "grad_norm": 0.07210099338506677, + "language_loss": 0.84715909, + "learning_rate": 0.0005840273312975317, + "loss": 0.8581953, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.18701172, + "step": 2406, + "time_per_iteration": 2.884800910949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113888, + "balance_loss_mlp": 1.09550619, + "epoch": 0.46306271642939595, + "flos": 480233276928.0, + "grad_norm": 0.08103405236073111, + "language_loss": 0.90235025, + "learning_rate": 0.0005837202035513555, + "loss": 0.9134891, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.18383789, + "step": 2407, + "time_per_iteration": 2.609774351119995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114389, + "balance_loss_mlp": 1.09645963, + "epoch": 0.4632550981146595, + "flos": 
580686879744.0, + "grad_norm": 0.08825825577707168, + "language_loss": 0.81317043, + "learning_rate": 0.0005834130433021136, + "loss": 0.8243143, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.17932129, + "step": 2408, + "time_per_iteration": 2.775449514389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109877, + "balance_loss_mlp": 1.09179258, + "epoch": 0.46344747979992307, + "flos": 523964298240.0, + "grad_norm": 0.07528135433799624, + "language_loss": 0.73480821, + "learning_rate": 0.0005831058506690563, + "loss": 0.74590695, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.18078613, + "step": 2409, + "time_per_iteration": 2.675328254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104926, + "balance_loss_mlp": 1.08739018, + "epoch": 0.4636398614851866, + "flos": 746501349888.0, + "grad_norm": 0.06500990989470928, + "language_loss": 0.85772568, + "learning_rate": 0.0005827986257714464, + "loss": 0.86877489, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.17541504, + "step": 2410, + "time_per_iteration": 2.934680461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106367, + "balance_loss_mlp": 1.0885216, + "epoch": 0.4638322431704502, + "flos": 596547224064.0, + "grad_norm": 0.1078033090301908, + "language_loss": 0.88550043, + "learning_rate": 0.0005824913687285591, + "loss": 0.89656413, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.17858887, + "step": 2411, + "time_per_iteration": 2.74306058883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101516, + "balance_loss_mlp": 1.08387256, + "epoch": 0.4640246248557137, + "flos": 539443971072.0, + "grad_norm": 0.08594294380237487, + "language_loss": 0.81337988, + "learning_rate": 0.0005821840796596821, + "loss": 0.82439506, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.17663574, + "step": 2412, + "time_per_iteration": 2.7274651527404785 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105841, + "balance_loss_mlp": 1.08832955, + "epoch": 0.4642170065409773, + "flos": 562625118720.0, + "grad_norm": 0.05827694326073197, + "language_loss": 0.80418169, + "learning_rate": 0.0005818767586841158, + "loss": 0.81524014, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.1751709, + "step": 2413, + "time_per_iteration": 2.779078722000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109963, + "balance_loss_mlp": 1.09252286, + "epoch": 0.46440938822624084, + "flos": 530959131648.0, + "grad_norm": 0.06834094492641501, + "language_loss": 0.86072665, + "learning_rate": 0.0005815694059211726, + "loss": 0.87182629, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.17456055, + "step": 2414, + "time_per_iteration": 2.7060773372650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022324, + "balance_loss_mlp": 1.01297832, + "epoch": 0.4646017699115044, + "flos": 1526325700608.0, + "grad_norm": 0.02599871836797638, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.81895959, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.09326172, + "step": 2415, + "time_per_iteration": 4.83809757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018248, + "balance_loss_mlp": 1.00894976, + "epoch": 0.464794151596768, + "flos": 1540831859712.0, + "grad_norm": 0.022144294594628845, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.7796331, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.09277344, + "step": 2416, + "time_per_iteration": 4.993790626525879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135903, + "balance_loss_mlp": 1.11812854, + "epoch": 0.46498653328203154, + "flos": 501467397120.0, + "grad_norm": 0.10260058932365836, + "language_loss": 0.862611, + "learning_rate": 0.0005806471581013931, + "loss": 0.87397003, + 
"num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.17785645, + "step": 2417, + "time_per_iteration": 2.689473867416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142112, + "balance_loss_mlp": 1.12353921, + "epoch": 0.46517891496729513, + "flos": 676144806912.0, + "grad_norm": 0.08959237751331865, + "language_loss": 0.78271216, + "learning_rate": 0.0005803396793823146, + "loss": 0.79413325, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.18579102, + "step": 2418, + "time_per_iteration": 2.8183717727661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126782, + "balance_loss_mlp": 1.10836434, + "epoch": 0.46537129665255866, + "flos": 585351816192.0, + "grad_norm": 0.10270562971795844, + "language_loss": 0.85666251, + "learning_rate": 0.0005800321694726065, + "loss": 0.86793029, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.1842041, + "step": 2419, + "time_per_iteration": 2.797482490539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116623, + "balance_loss_mlp": 1.09855139, + "epoch": 0.46556367833782225, + "flos": 587704108032.0, + "grad_norm": 0.0731094360896604, + "language_loss": 0.86679709, + "learning_rate": 0.0005797246284916545, + "loss": 0.8779633, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.1809082, + "step": 2420, + "time_per_iteration": 2.707942008972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054766, + "balance_loss_mlp": 1.04570651, + "epoch": 0.4657560600230858, + "flos": 1485453551616.0, + "grad_norm": 0.038938158808133214, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78559953, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.09082031, + "step": 2421, + "time_per_iteration": 4.987195253372192 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094388, + "balance_loss_mlp": 1.07617295, + "epoch": 0.46594844170834937, + "flos": 
580247110656.0, + "grad_norm": 0.09940681141683862, + "language_loss": 0.87739611, + "learning_rate": 0.0005791094537936233, + "loss": 0.88833994, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.18237305, + "step": 2422, + "time_per_iteration": 2.7631046772003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091469, + "balance_loss_mlp": 1.07345629, + "epoch": 0.4661408233936129, + "flos": 512571400704.0, + "grad_norm": 0.06779180589479097, + "language_loss": 0.8166219, + "learning_rate": 0.0005788018203153762, + "loss": 0.82753664, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.18017578, + "step": 2423, + "time_per_iteration": 2.6615488529205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085208, + "balance_loss_mlp": 1.06742215, + "epoch": 0.4663332050788765, + "flos": 491077748736.0, + "grad_norm": 0.08426811135055082, + "language_loss": 0.85527384, + "learning_rate": 0.000578494156243549, + "loss": 0.86612594, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.17810059, + "step": 2424, + "time_per_iteration": 2.6183924674987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089098, + "balance_loss_mlp": 1.07045364, + "epoch": 0.4665255867641401, + "flos": 512623157760.0, + "grad_norm": 0.08457394710823794, + "language_loss": 0.89275956, + "learning_rate": 0.0005781864616975878, + "loss": 0.90365046, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.18640137, + "step": 2425, + "time_per_iteration": 2.6595993041992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096807, + "balance_loss_mlp": 1.07906842, + "epoch": 0.4667179684494036, + "flos": 424812750336.0, + "grad_norm": 0.0955155738973633, + "language_loss": 0.84080482, + "learning_rate": 0.0005778787367969502, + "loss": 0.8517729, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.17749023, + "step": 2426, + "time_per_iteration": 2.573312759399414 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010984, + "balance_loss_mlp": 1.08017302, + "epoch": 0.4669103501346672, + "flos": 707956526592.0, + "grad_norm": 0.07224995984565184, + "language_loss": 0.81008911, + "learning_rate": 0.0005775709816611053, + "loss": 0.82107311, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.18237305, + "step": 2427, + "time_per_iteration": 2.9737117290496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096193, + "balance_loss_mlp": 1.07804918, + "epoch": 0.4671027318199307, + "flos": 554832239616.0, + "grad_norm": 0.0630888064205099, + "language_loss": 0.83649611, + "learning_rate": 0.0005772631964095346, + "loss": 0.84745806, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.18151855, + "step": 2428, + "time_per_iteration": 2.7121798992156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108204, + "balance_loss_mlp": 1.09003639, + "epoch": 0.4672951135051943, + "flos": 567109817856.0, + "grad_norm": 0.07098479359046088, + "language_loss": 0.85673976, + "learning_rate": 0.000576955381161731, + "loss": 0.86782181, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.18164062, + "step": 2429, + "time_per_iteration": 2.7059943675994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102277, + "balance_loss_mlp": 1.08414483, + "epoch": 0.46748749519045785, + "flos": 424518713856.0, + "grad_norm": 0.07900180679196234, + "language_loss": 0.86017609, + "learning_rate": 0.0005766475360371985, + "loss": 0.87119883, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.18115234, + "step": 2430, + "time_per_iteration": 2.5818653106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106318, + "balance_loss_mlp": 1.08826935, + "epoch": 0.46767987687572143, + "flos": 538344548352.0, + "grad_norm": 0.07907770586360956, + "language_loss": 0.8455205, + "learning_rate": 0.0005763396611554536, + "loss": 
0.85658371, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.18066406, + "step": 2431, + "time_per_iteration": 2.6773664951324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109521, + "balance_loss_mlp": 1.09193754, + "epoch": 0.467872258560985, + "flos": 823702224384.0, + "grad_norm": 0.09111480047327246, + "language_loss": 0.79973984, + "learning_rate": 0.0005760317566360237, + "loss": 0.81083506, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.17602539, + "step": 2432, + "time_per_iteration": 3.014580726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114014, + "balance_loss_mlp": 1.09622765, + "epoch": 0.46806464024624855, + "flos": 661663240704.0, + "grad_norm": 0.0789075933194326, + "language_loss": 0.85020924, + "learning_rate": 0.000575723822598448, + "loss": 0.86134946, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.17785645, + "step": 2433, + "time_per_iteration": 2.8005478382110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111562, + "balance_loss_mlp": 1.09765542, + "epoch": 0.46825702193151214, + "flos": 755700171264.0, + "grad_norm": 0.07367233066443238, + "language_loss": 0.8147794, + "learning_rate": 0.0005754158591622773, + "loss": 0.82593554, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.1796875, + "step": 2434, + "time_per_iteration": 3.0118775367736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011158, + "balance_loss_mlp": 1.09752536, + "epoch": 0.4684494036167757, + "flos": 439393061376.0, + "grad_norm": 0.07922373152064655, + "language_loss": 0.82327235, + "learning_rate": 0.0005751078664470732, + "loss": 0.83443034, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.18286133, + "step": 2435, + "time_per_iteration": 2.5390684604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116733, + "balance_loss_mlp": 1.09935236, + "epoch": 0.46864178530203926, + 
"flos": 532706098176.0, + "grad_norm": 0.07859313369065737, + "language_loss": 0.85868919, + "learning_rate": 0.0005747998445724094, + "loss": 0.86985648, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.17382812, + "step": 2436, + "time_per_iteration": 2.6606297492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112946, + "balance_loss_mlp": 1.11235368, + "epoch": 0.4688341669873028, + "flos": 576627268608.0, + "grad_norm": 0.10622400322266522, + "language_loss": 0.8919673, + "learning_rate": 0.0005744917936578707, + "loss": 0.90326178, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.17126465, + "step": 2437, + "time_per_iteration": 2.8204565048217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121886, + "balance_loss_mlp": 1.10436273, + "epoch": 0.4690265486725664, + "flos": 539579791872.0, + "grad_norm": 0.06508472909978535, + "language_loss": 0.8377744, + "learning_rate": 0.0005741837138230526, + "loss": 0.8489933, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.17553711, + "step": 2438, + "time_per_iteration": 2.781350612640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122059, + "balance_loss_mlp": 1.10464203, + "epoch": 0.4692189303578299, + "flos": 770510278656.0, + "grad_norm": 0.06834159619761165, + "language_loss": 0.86276829, + "learning_rate": 0.0005738756051875627, + "loss": 0.87398893, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.17431641, + "step": 2439, + "time_per_iteration": 3.121708631515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131074, + "balance_loss_mlp": 1.11383653, + "epoch": 0.4694113120430935, + "flos": 571396654080.0, + "grad_norm": 0.07303953933220877, + "language_loss": 0.82923281, + "learning_rate": 0.0005735674678710192, + "loss": 0.84054363, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.17260742, + "step": 2440, + "time_per_iteration": 2.749302864074707 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122683, + "balance_loss_mlp": 1.1051836, + "epoch": 0.4696036937283571, + "flos": 748816565760.0, + "grad_norm": 0.1547549936477752, + "language_loss": 0.80928504, + "learning_rate": 0.0005732593019930517, + "loss": 0.82051194, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.17504883, + "step": 2441, + "time_per_iteration": 2.9091122150421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137766, + "balance_loss_mlp": 1.12098181, + "epoch": 0.4697960754136206, + "flos": 493454633472.0, + "grad_norm": 0.0743256165664551, + "language_loss": 0.87914228, + "learning_rate": 0.0005729511076733008, + "loss": 0.89051992, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.16796875, + "step": 2442, + "time_per_iteration": 2.728706121444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140336, + "balance_loss_mlp": 1.12288404, + "epoch": 0.4699884570988842, + "flos": 725118925824.0, + "grad_norm": 0.07419109808583535, + "language_loss": 0.84796697, + "learning_rate": 0.000572642885031418, + "loss": 0.85937035, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.17456055, + "step": 2443, + "time_per_iteration": 2.8746440410614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134354, + "balance_loss_mlp": 1.11715245, + "epoch": 0.47018083878414774, + "flos": 555427653120.0, + "grad_norm": 0.10756822588652355, + "language_loss": 0.80578518, + "learning_rate": 0.0005723346341870662, + "loss": 0.81712866, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.17224121, + "step": 2444, + "time_per_iteration": 2.740504741668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114197, + "balance_loss_mlp": 1.12406492, + "epoch": 0.4703732204694113, + "flos": 424069032960.0, + "grad_norm": 0.12204296392179416, + "language_loss": 0.86163437, + "learning_rate": 0.0005720263552599188, + "loss": 
0.87305409, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.17907715, + "step": 2445, + "time_per_iteration": 2.489807367324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112646, + "balance_loss_mlp": 1.10888886, + "epoch": 0.47056560215467486, + "flos": 703494222336.0, + "grad_norm": 0.08439630255123334, + "language_loss": 0.79720879, + "learning_rate": 0.0005717180483696604, + "loss": 0.80847341, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.17590332, + "step": 2446, + "time_per_iteration": 2.9626049995422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113574, + "balance_loss_mlp": 1.09573984, + "epoch": 0.47075798383993844, + "flos": 554963291136.0, + "grad_norm": 0.0764291785045912, + "language_loss": 0.83012414, + "learning_rate": 0.0005714097136359862, + "loss": 0.84125984, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.17822266, + "step": 2447, + "time_per_iteration": 2.6736068725585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105721, + "balance_loss_mlp": 1.08789945, + "epoch": 0.470950365525202, + "flos": 564305273856.0, + "grad_norm": 0.08513203657143086, + "language_loss": 0.86345923, + "learning_rate": 0.0005711013511786027, + "loss": 0.87451649, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.1784668, + "step": 2448, + "time_per_iteration": 2.7899038791656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096309, + "balance_loss_mlp": 1.07914329, + "epoch": 0.47114274721046556, + "flos": 534450493440.0, + "grad_norm": 0.06769719009727464, + "language_loss": 0.83320636, + "learning_rate": 0.0005707929611172263, + "loss": 0.8441695, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.171875, + "step": 2449, + "time_per_iteration": 2.7302591800689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094917, + "balance_loss_mlp": 1.07738137, + "epoch": 0.47133512889572915, + 
"flos": 473117303808.0, + "grad_norm": 0.0952592580133139, + "language_loss": 0.83792615, + "learning_rate": 0.000570484543571585, + "loss": 0.84887528, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.17553711, + "step": 2450, + "time_per_iteration": 2.553699254989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091615, + "balance_loss_mlp": 1.07405567, + "epoch": 0.4715275105809927, + "flos": 459013837824.0, + "grad_norm": 0.09253179962645706, + "language_loss": 0.82604945, + "learning_rate": 0.0005701760986614171, + "loss": 0.83696556, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.17578125, + "step": 2451, + "time_per_iteration": 2.5708320140838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084195, + "balance_loss_mlp": 1.06648016, + "epoch": 0.47171989226625627, + "flos": 422003437056.0, + "grad_norm": 0.09280751659958478, + "language_loss": 0.87434494, + "learning_rate": 0.0005698676265064714, + "loss": 0.88518691, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.17736816, + "step": 2452, + "time_per_iteration": 2.505521297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.06540704, + "epoch": 0.4719122739515198, + "flos": 457434998784.0, + "grad_norm": 0.08307061480415358, + "language_loss": 0.88798922, + "learning_rate": 0.0005695591272265074, + "loss": 0.89882344, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.18017578, + "step": 2453, + "time_per_iteration": 2.5634660720825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091392, + "balance_loss_mlp": 1.07360613, + "epoch": 0.4721046556367834, + "flos": 514975449600.0, + "grad_norm": 0.09129518334944925, + "language_loss": 0.81819969, + "learning_rate": 0.0005692506009412954, + "loss": 0.8291136, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.17797852, + "step": 2454, + "time_per_iteration": 2.740715980529785 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094643, + "balance_loss_mlp": 1.08458209, + "epoch": 0.4722970373220469, + "flos": 1572258138624.0, + "grad_norm": 0.045004720534391626, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78645909, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.10058594, + "step": 2455, + "time_per_iteration": 4.978295803070068 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110787, + "balance_loss_mlp": 1.08972645, + "epoch": 0.4724894190073105, + "flos": 586214102016.0, + "grad_norm": 0.07943806135548723, + "language_loss": 0.89481127, + "learning_rate": 0.0005686334678342593, + "loss": 0.90588999, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.18151855, + "step": 2456, + "time_per_iteration": 2.9444401264190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124124, + "balance_loss_mlp": 1.10643291, + "epoch": 0.4726818006925741, + "flos": 867645789696.0, + "grad_norm": 0.08486852653668125, + "language_loss": 0.81272578, + "learning_rate": 0.0005683248612520274, + "loss": 0.8239671, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.17700195, + "step": 2457, + "time_per_iteration": 3.1061813831329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113744, + "balance_loss_mlp": 1.11931992, + "epoch": 0.4728741823778376, + "flos": 752967581184.0, + "grad_norm": 0.11516736159890015, + "language_loss": 0.83477956, + "learning_rate": 0.0005680162281437321, + "loss": 0.84615391, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.18115234, + "step": 2458, + "time_per_iteration": 2.929063558578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148235, + "balance_loss_mlp": 1.13042545, + "epoch": 0.4730665640631012, + "flos": 538571773440.0, + "grad_norm": 0.07751254840004482, + "language_loss": 0.84309924, + "learning_rate": 0.000567707568629195, + "loss": 
0.85458159, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.17810059, + "step": 2459, + "time_per_iteration": 2.7221994400024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147022, + "balance_loss_mlp": 1.12910485, + "epoch": 0.47325894574836475, + "flos": 491653338624.0, + "grad_norm": 0.08725044616859287, + "language_loss": 0.81842762, + "learning_rate": 0.0005673988828282486, + "loss": 0.82989782, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.17932129, + "step": 2460, + "time_per_iteration": 2.7002882957458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137284, + "balance_loss_mlp": 1.11850882, + "epoch": 0.47345132743362833, + "flos": 764459223552.0, + "grad_norm": 0.08215342810100013, + "language_loss": 0.80515504, + "learning_rate": 0.0005670901708607352, + "loss": 0.8165279, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.1875, + "step": 2461, + "time_per_iteration": 2.9950685501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118533, + "balance_loss_mlp": 1.09990108, + "epoch": 0.47364370911889186, + "flos": 540173007360.0, + "grad_norm": 0.10884730986404606, + "language_loss": 0.83628744, + "learning_rate": 0.0005667814328465076, + "loss": 0.84747279, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.1862793, + "step": 2462, + "time_per_iteration": 2.645465612411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108626, + "balance_loss_mlp": 1.09035087, + "epoch": 0.47383609080415545, + "flos": 406219815936.0, + "grad_norm": 0.09091581525952792, + "language_loss": 0.81654978, + "learning_rate": 0.0005664726689054285, + "loss": 0.82763606, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.18261719, + "step": 2463, + "time_per_iteration": 2.4545066356658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104041, + "balance_loss_mlp": 1.08579004, + "epoch": 0.474028472489419, + 
"flos": 453476703744.0, + "grad_norm": 0.07864824239143242, + "language_loss": 0.80990708, + "learning_rate": 0.0005661638791573704, + "loss": 0.82094747, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.18237305, + "step": 2464, + "time_per_iteration": 2.734745502471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108445, + "balance_loss_mlp": 1.08969331, + "epoch": 0.47422085417468257, + "flos": 492177171456.0, + "grad_norm": 0.0786760499807007, + "language_loss": 0.86728454, + "learning_rate": 0.0005658550637222164, + "loss": 0.87836903, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.1875, + "step": 2465, + "time_per_iteration": 2.6243197917938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098109, + "balance_loss_mlp": 1.07942867, + "epoch": 0.47441323585994616, + "flos": 738854203392.0, + "grad_norm": 0.07656108123336647, + "language_loss": 0.82025492, + "learning_rate": 0.0005655462227198592, + "loss": 0.831236, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.18676758, + "step": 2466, + "time_per_iteration": 2.9340949058532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090686, + "balance_loss_mlp": 1.0713619, + "epoch": 0.4746056175452097, + "flos": 484685669376.0, + "grad_norm": 0.08929128939464244, + "language_loss": 0.84165299, + "learning_rate": 0.0005652373562702016, + "loss": 0.8525598, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.19311523, + "step": 2467, + "time_per_iteration": 2.6669704914093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088642, + "balance_loss_mlp": 1.07042646, + "epoch": 0.4747979992304733, + "flos": 461052269568.0, + "grad_norm": 0.09740211929478898, + "language_loss": 0.88243479, + "learning_rate": 0.000564928464493156, + "loss": 0.89332116, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.18212891, + "step": 2468, + "time_per_iteration": 2.5501999855041504 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083596, + "balance_loss_mlp": 1.06571448, + "epoch": 0.4749903809157368, + "flos": 864431212032.0, + "grad_norm": 0.10206964777214489, + "language_loss": 0.8130033, + "learning_rate": 0.000564619547508645, + "loss": 0.82383919, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.17907715, + "step": 2469, + "time_per_iteration": 3.1110846996307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080615, + "balance_loss_mlp": 1.0618155, + "epoch": 0.4751827626010004, + "flos": 505546831872.0, + "grad_norm": 0.10847559686300064, + "language_loss": 0.83074248, + "learning_rate": 0.0005643106054366008, + "loss": 0.84154862, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.18798828, + "step": 2470, + "time_per_iteration": 2.5955324172973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082834, + "balance_loss_mlp": 1.0653584, + "epoch": 0.47537514428626393, + "flos": 559388519424.0, + "grad_norm": 0.07776069310312227, + "language_loss": 0.78943384, + "learning_rate": 0.000564001638396965, + "loss": 0.80026221, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.17492676, + "step": 2471, + "time_per_iteration": 2.7306296825408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090328, + "balance_loss_mlp": 1.07253027, + "epoch": 0.4755675259715275, + "flos": 834260000256.0, + "grad_norm": 0.0797482134953605, + "language_loss": 0.81547666, + "learning_rate": 0.0005636926465096897, + "loss": 0.8263799, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.17810059, + "step": 2472, + "time_per_iteration": 3.059279203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112, + "balance_loss_mlp": 1.09371316, + "epoch": 0.47575990765679105, + "flos": 508237576704.0, + "grad_norm": 0.08460495515925144, + "language_loss": 0.87285447, + "learning_rate": 0.0005633836298947363, + "loss": 0.88397449, 
+ "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.18286133, + "step": 2473, + "time_per_iteration": 2.6521553993225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122658, + "balance_loss_mlp": 1.10413289, + "epoch": 0.47595228934205464, + "flos": 591845211648.0, + "grad_norm": 0.09203669339342216, + "language_loss": 0.70590854, + "learning_rate": 0.000563074588672075, + "loss": 0.71713507, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.18530273, + "step": 2474, + "time_per_iteration": 2.7375221252441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113642, + "balance_loss_mlp": 1.11839581, + "epoch": 0.4761446710273182, + "flos": 580607958528.0, + "grad_norm": 0.0857314817059495, + "language_loss": 0.8500272, + "learning_rate": 0.0005627655229616868, + "loss": 0.86139143, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.18029785, + "step": 2475, + "time_per_iteration": 2.7078299522399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128625, + "balance_loss_mlp": 1.11030293, + "epoch": 0.47633705271258175, + "flos": 672893153280.0, + "grad_norm": 0.07963853645873449, + "language_loss": 0.89927155, + "learning_rate": 0.0005624564328835616, + "loss": 0.91055775, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.18334961, + "step": 2476, + "time_per_iteration": 2.8388264179229736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117766, + "balance_loss_mlp": 1.09916914, + "epoch": 0.47652943439784534, + "flos": 541857931776.0, + "grad_norm": 0.07471116365669703, + "language_loss": 0.83945388, + "learning_rate": 0.0005621473185576986, + "loss": 0.85063154, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.18579102, + "step": 2477, + "time_per_iteration": 2.7755634784698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112627, + "balance_loss_mlp": 1.09451878, + "epoch": 0.4767218160831089, + "flos": 
524819243520.0, + "grad_norm": 0.10765434361010802, + "language_loss": 0.87517297, + "learning_rate": 0.0005618381801041068, + "loss": 0.88629925, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.18115234, + "step": 2478, + "time_per_iteration": 2.6078171730041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110924, + "balance_loss_mlp": 1.0912751, + "epoch": 0.47691419776837246, + "flos": 568056167424.0, + "grad_norm": 0.09054531696498577, + "language_loss": 0.8286736, + "learning_rate": 0.0005615290176428044, + "loss": 0.83976603, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.17980957, + "step": 2479, + "time_per_iteration": 2.658313035964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093873, + "balance_loss_mlp": 1.07611132, + "epoch": 0.477106579453636, + "flos": 530931967488.0, + "grad_norm": 0.07218164617984826, + "language_loss": 0.85039639, + "learning_rate": 0.0005612198312938187, + "loss": 0.8613351, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.17773438, + "step": 2480, + "time_per_iteration": 2.7423031330108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_mlp": 1.07839966, + "epoch": 0.4772989611388996, + "flos": 594283765248.0, + "grad_norm": 0.08183869789897112, + "language_loss": 0.79371572, + "learning_rate": 0.0005609106211771868, + "loss": 0.80467397, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.17443848, + "step": 2481, + "time_per_iteration": 2.888284921646118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098997, + "balance_loss_mlp": 1.08134174, + "epoch": 0.4774913428241631, + "flos": 544622828544.0, + "grad_norm": 0.07799032438633784, + "language_loss": 0.89138782, + "learning_rate": 0.0005606013874129543, + "loss": 0.90237772, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.17675781, + "step": 2482, + "time_per_iteration": 2.8308520317077637 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096101, + "balance_loss_mlp": 1.07892263, + "epoch": 0.4776837245094267, + "flos": 540079031808.0, + "grad_norm": 0.06912495328146803, + "language_loss": 0.79914749, + "learning_rate": 0.0005602921301211768, + "loss": 0.81010854, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.17199707, + "step": 2483, + "time_per_iteration": 2.745229721069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092062, + "balance_loss_mlp": 1.07441866, + "epoch": 0.4778761061946903, + "flos": 471785513472.0, + "grad_norm": 0.08947954354315603, + "language_loss": 0.8218801, + "learning_rate": 0.0005599828494219185, + "loss": 0.83280063, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.1763916, + "step": 2484, + "time_per_iteration": 2.5549302101135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096763, + "balance_loss_mlp": 1.07945359, + "epoch": 0.4780684878799538, + "flos": 726082527744.0, + "grad_norm": 0.09532235552733567, + "language_loss": 0.8879438, + "learning_rate": 0.0005596735454352527, + "loss": 0.89891142, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.17333984, + "step": 2485, + "time_per_iteration": 2.8665127754211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094881, + "balance_loss_mlp": 1.07777441, + "epoch": 0.4782608695652174, + "flos": 548922147840.0, + "grad_norm": 0.09434748219243295, + "language_loss": 0.85316986, + "learning_rate": 0.0005593642182812619, + "loss": 0.8641187, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.17126465, + "step": 2486, + "time_per_iteration": 2.6778790950775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094993, + "balance_loss_mlp": 1.07798147, + "epoch": 0.47845325125048094, + "flos": 829923604992.0, + "grad_norm": 0.07207308279854807, + "language_loss": 0.83091319, + "learning_rate": 0.0005590548680800378, + "loss": 0.84186316, + 
"num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.17028809, + "step": 2487, + "time_per_iteration": 3.121678590774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100078, + "balance_loss_mlp": 1.08330488, + "epoch": 0.4786456329357445, + "flos": 514164920832.0, + "grad_norm": 0.0688175569320757, + "language_loss": 0.76333058, + "learning_rate": 0.0005587454949516804, + "loss": 0.77433127, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.16784668, + "step": 2488, + "time_per_iteration": 2.7487144470214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109664, + "balance_loss_mlp": 1.09223557, + "epoch": 0.47883801462100806, + "flos": 564658781184.0, + "grad_norm": 0.0791895688664035, + "language_loss": 0.87661278, + "learning_rate": 0.0005584360990162993, + "loss": 0.88770944, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.17443848, + "step": 2489, + "time_per_iteration": 2.6889615058898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.08878708, + "epoch": 0.47903039630627164, + "flos": 579577545216.0, + "grad_norm": 0.06381910852284944, + "language_loss": 0.85160542, + "learning_rate": 0.0005581266803940124, + "loss": 0.8626619, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.16870117, + "step": 2490, + "time_per_iteration": 2.752704381942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108064, + "balance_loss_mlp": 1.09077895, + "epoch": 0.47922277799153523, + "flos": 618950149632.0, + "grad_norm": 0.06997425176776657, + "language_loss": 0.87046134, + "learning_rate": 0.0005578172392049471, + "loss": 0.88154197, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.17297363, + "step": 2491, + "time_per_iteration": 2.744326114654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113808, + "balance_loss_mlp": 1.09704673, + "epoch": 0.47941515967679876, + "flos": 
639653096448.0, + "grad_norm": 0.0919919864780235, + "language_loss": 0.84245729, + "learning_rate": 0.0005575077755692386, + "loss": 0.85359544, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.16760254, + "step": 2492, + "time_per_iteration": 2.829349994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106621, + "balance_loss_mlp": 1.08978891, + "epoch": 0.47960754136206235, + "flos": 519823194624.0, + "grad_norm": 0.07193820952165939, + "language_loss": 0.85866803, + "learning_rate": 0.0005571982896070316, + "loss": 0.86973423, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.16845703, + "step": 2493, + "time_per_iteration": 2.6917920112609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111679, + "balance_loss_mlp": 1.09457207, + "epoch": 0.4797999230473259, + "flos": 475044507648.0, + "grad_norm": 0.08033850408937983, + "language_loss": 0.89604986, + "learning_rate": 0.0005568887814384792, + "loss": 0.9071666, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.17114258, + "step": 2494, + "time_per_iteration": 2.569196939468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106396, + "balance_loss_mlp": 1.08963561, + "epoch": 0.47999230473258947, + "flos": 532026620928.0, + "grad_norm": 0.07662616215624289, + "language_loss": 0.87274265, + "learning_rate": 0.000556579251183743, + "loss": 0.88380659, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.16772461, + "step": 2495, + "time_per_iteration": 4.119016408920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.09276271, + "epoch": 0.480184686417853, + "flos": 601486373376.0, + "grad_norm": 0.07795098880988466, + "language_loss": 0.79870969, + "learning_rate": 0.0005562696989629936, + "loss": 0.80980641, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.16918945, + "step": 2496, + "time_per_iteration": 2.780027151107788 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112455, + "balance_loss_mlp": 1.09557533, + "epoch": 0.4803770681031166, + "flos": 528196806144.0, + "grad_norm": 0.068284016634177, + "language_loss": 0.82789242, + "learning_rate": 0.0005559601248964095, + "loss": 0.83901697, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.16894531, + "step": 2497, + "time_per_iteration": 2.653590202331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110865, + "balance_loss_mlp": 1.09190154, + "epoch": 0.4805694497883801, + "flos": 511192622592.0, + "grad_norm": 0.10697304585744172, + "language_loss": 0.85506153, + "learning_rate": 0.0005556505291041783, + "loss": 0.86614799, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.16760254, + "step": 2498, + "time_per_iteration": 2.720294952392578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106549, + "balance_loss_mlp": 1.08972836, + "epoch": 0.4807618314736437, + "flos": 600342160896.0, + "grad_norm": 0.0621998173583794, + "language_loss": 0.84237647, + "learning_rate": 0.0005553409117064954, + "loss": 0.85344195, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.16833496, + "step": 2499, + "time_per_iteration": 2.9154043197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119945, + "balance_loss_mlp": 1.10298109, + "epoch": 0.4809542131589073, + "flos": 568965441024.0, + "grad_norm": 0.07282479458874046, + "language_loss": 0.84656966, + "learning_rate": 0.0005550312728235654, + "loss": 0.85776907, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.16967773, + "step": 2500, + "time_per_iteration": 2.700421094894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110812, + "balance_loss_mlp": 1.09159744, + "epoch": 0.4811465948441708, + "flos": 575994779136.0, + "grad_norm": 0.08404220746537734, + "language_loss": 0.83821297, + "learning_rate": 0.0005547216125756003, + "loss": 0.84929419, + 
"num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.1652832, + "step": 2501, + "time_per_iteration": 2.7834067344665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106276, + "balance_loss_mlp": 1.08955085, + "epoch": 0.4813389765294344, + "flos": 823865209344.0, + "grad_norm": 0.07639679647694927, + "language_loss": 0.81906044, + "learning_rate": 0.0005544119310828211, + "loss": 0.83012319, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.16723633, + "step": 2502, + "time_per_iteration": 3.116422414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_mlp": 1.09020913, + "epoch": 0.48153135821469795, + "flos": 635531816448.0, + "grad_norm": 0.07431223188319182, + "language_loss": 0.84573793, + "learning_rate": 0.0005541022284654568, + "loss": 0.85680836, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.16845703, + "step": 2503, + "time_per_iteration": 2.9265871047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110287, + "balance_loss_mlp": 1.08615696, + "epoch": 0.48172373989996153, + "flos": 503701120512.0, + "grad_norm": 0.06355297884535237, + "language_loss": 0.83910048, + "learning_rate": 0.0005537925048437446, + "loss": 0.85012925, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.16723633, + "step": 2504, + "time_per_iteration": 2.6517508029937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087379, + "balance_loss_mlp": 1.07774711, + "epoch": 0.48191612158522507, + "flos": 1532362074624.0, + "grad_norm": 0.041815183909307344, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76838851, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.09619141, + "step": 2505, + "time_per_iteration": 4.958322048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105183, + "balance_loss_mlp": 1.08836293, + "epoch": 0.48210850327048865, + "flos": 
702424161792.0, + "grad_norm": 0.060666396845578126, + "language_loss": 0.88195586, + "learning_rate": 0.0005531729950682664, + "loss": 0.8930077, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.16833496, + "step": 2506, + "time_per_iteration": 3.0288734436035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103671, + "balance_loss_mlp": 1.08631384, + "epoch": 0.4823008849557522, + "flos": 439778502144.0, + "grad_norm": 0.10090208417938805, + "language_loss": 0.84562349, + "learning_rate": 0.000552863209155015, + "loss": 0.85666019, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.17382812, + "step": 2507, + "time_per_iteration": 2.503030300140381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104399, + "balance_loss_mlp": 1.0873642, + "epoch": 0.48249326664101577, + "flos": 471859665408.0, + "grad_norm": 0.0644343170841742, + "language_loss": 0.82010555, + "learning_rate": 0.0005525534027184461, + "loss": 0.83114958, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.17053223, + "step": 2508, + "time_per_iteration": 2.563375949859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115093, + "balance_loss_mlp": 1.09834397, + "epoch": 0.48268564832627936, + "flos": 563225674752.0, + "grad_norm": 0.20306769309253048, + "language_loss": 0.82742786, + "learning_rate": 0.0005522435758788365, + "loss": 0.83857882, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.16760254, + "step": 2509, + "time_per_iteration": 2.773317813873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107185, + "balance_loss_mlp": 1.08974481, + "epoch": 0.4828780300115429, + "flos": 629606670336.0, + "grad_norm": 0.08084829795782655, + "language_loss": 0.80297685, + "learning_rate": 0.0005519337287564721, + "loss": 0.81404877, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.17468262, + "step": 2510, + "time_per_iteration": 2.8417367935180664 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109893, + "balance_loss_mlp": 1.09273911, + "epoch": 0.4830704116968065, + "flos": 631850305536.0, + "grad_norm": 0.07005467856459312, + "language_loss": 0.83318454, + "learning_rate": 0.000551623861471646, + "loss": 0.84428346, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.17175293, + "step": 2511, + "time_per_iteration": 4.144210577011108 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031589, + "balance_loss_mlp": 1.02186131, + "epoch": 0.48326279338207, + "flos": 1569268588032.0, + "grad_norm": 0.022823457387693702, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79850423, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.09716797, + "step": 2512, + "time_per_iteration": 4.846112489700317 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105484, + "balance_loss_mlp": 1.08805561, + "epoch": 0.4834551750673336, + "flos": 509238254592.0, + "grad_norm": 0.06582055063949785, + "language_loss": 0.86307418, + "learning_rate": 0.0005510040668958211, + "loss": 0.87412906, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.17443848, + "step": 2513, + "time_per_iteration": 2.5893678665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027944, + "balance_loss_mlp": 1.01802599, + "epoch": 0.48364755675259713, + "flos": 1528663311360.0, + "grad_norm": 0.018178820637651416, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78788525, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.09912109, + "step": 2514, + "time_per_iteration": 4.883544445037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104231, + "balance_loss_mlp": 1.08638501, + "epoch": 0.4838399384378607, + "flos": 564989893632.0, + "grad_norm": 0.07451301520475437, + "language_loss": 0.83174801, + "learning_rate": 0.0005503841931138645, + "loss": 
0.84279031, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.17858887, + "step": 2515, + "time_per_iteration": 2.6821184158325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099055, + "balance_loss_mlp": 1.0817579, + "epoch": 0.4840323201231243, + "flos": 387691121664.0, + "grad_norm": 0.1026377711865236, + "language_loss": 0.81650221, + "learning_rate": 0.0005500742268214025, + "loss": 0.82749277, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.17321777, + "step": 2516, + "time_per_iteration": 2.501392364501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094696, + "balance_loss_mlp": 1.07677877, + "epoch": 0.48422470180838784, + "flos": 630995360256.0, + "grad_norm": 0.06104395933883966, + "language_loss": 0.85527956, + "learning_rate": 0.0005497642410884014, + "loss": 0.86622655, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.17919922, + "step": 2517, + "time_per_iteration": 2.7879879474639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092849, + "balance_loss_mlp": 1.07494426, + "epoch": 0.4844170834936514, + "flos": 499226333184.0, + "grad_norm": 0.0763804859448823, + "language_loss": 0.85418707, + "learning_rate": 0.0005494542360352085, + "loss": 0.86511558, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.17919922, + "step": 2518, + "time_per_iteration": 2.705934762954712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098599, + "balance_loss_mlp": 1.0811708, + "epoch": 0.48460946517891496, + "flos": 551076576768.0, + "grad_norm": 0.07348525281964927, + "language_loss": 0.855097, + "learning_rate": 0.0005491442117821783, + "loss": 0.86608291, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.17456055, + "step": 2519, + "time_per_iteration": 2.7056097984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097149, + "balance_loss_mlp": 1.07910061, + "epoch": 0.48480184686417854, + 
"flos": 529390204416.0, + "grad_norm": 0.07963371062569355, + "language_loss": 0.87741303, + "learning_rate": 0.0005488341684496732, + "loss": 0.88838446, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.18054199, + "step": 2520, + "time_per_iteration": 2.6991913318634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107269, + "balance_loss_mlp": 1.08979297, + "epoch": 0.4849942285494421, + "flos": 531912821760.0, + "grad_norm": 0.06522694836378315, + "language_loss": 0.91749704, + "learning_rate": 0.0005485241061580624, + "loss": 0.92856967, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.17480469, + "step": 2521, + "time_per_iteration": 2.751336097717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111149, + "balance_loss_mlp": 1.09335089, + "epoch": 0.48518661023470566, + "flos": 722578682880.0, + "grad_norm": 0.0788581364531382, + "language_loss": 0.84810591, + "learning_rate": 0.0005482140250277228, + "loss": 0.85921741, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.17797852, + "step": 2522, + "time_per_iteration": 3.012603759765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116154, + "balance_loss_mlp": 1.09896421, + "epoch": 0.4853789919199692, + "flos": 506105169408.0, + "grad_norm": 0.081531881919659, + "language_loss": 0.87781787, + "learning_rate": 0.0005479039251790387, + "loss": 0.88897943, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.17211914, + "step": 2523, + "time_per_iteration": 2.6643292903900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115817, + "balance_loss_mlp": 1.0985198, + "epoch": 0.4855713736052328, + "flos": 660487094784.0, + "grad_norm": 0.1008566510750689, + "language_loss": 0.84847081, + "learning_rate": 0.0005475938067324014, + "loss": 0.85962898, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.1730957, + "step": 2524, + "time_per_iteration": 2.8631820678710938 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129536, + "balance_loss_mlp": 1.11252499, + "epoch": 0.48576375529049637, + "flos": 436959277056.0, + "grad_norm": 0.08592622698203999, + "language_loss": 0.83456719, + "learning_rate": 0.0005472836698082098, + "loss": 0.84586251, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.17028809, + "step": 2525, + "time_per_iteration": 2.5364460945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109588, + "balance_loss_mlp": 1.09244525, + "epoch": 0.4859561369757599, + "flos": 581707381248.0, + "grad_norm": 0.06952957834620052, + "language_loss": 0.8412683, + "learning_rate": 0.0005469735145268694, + "loss": 0.85236418, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.17138672, + "step": 2526, + "time_per_iteration": 2.766571283340454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106022, + "balance_loss_mlp": 1.08884394, + "epoch": 0.4861485186610235, + "flos": 487964487168.0, + "grad_norm": 0.07975413334667165, + "language_loss": 0.80809188, + "learning_rate": 0.0005466633410087933, + "loss": 0.81915212, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.171875, + "step": 2527, + "time_per_iteration": 2.738344192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072106, + "balance_loss_mlp": 1.06094766, + "epoch": 0.486340900346287, + "flos": 1557734727168.0, + "grad_norm": 0.03644390169401177, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78332925, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.11181641, + "step": 2528, + "time_per_iteration": 4.871282339096069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090172, + "balance_loss_mlp": 1.07268429, + "epoch": 0.4865332820315506, + "flos": 483005514240.0, + "grad_norm": 0.06987485087243678, + "language_loss": 0.87962806, + "learning_rate": 0.0005460429397441214, + "loss": 
0.89052981, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.1751709, + "step": 2529, + "time_per_iteration": 2.589794635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097683, + "balance_loss_mlp": 1.08112478, + "epoch": 0.48672566371681414, + "flos": 535809447936.0, + "grad_norm": 0.08125917870845005, + "language_loss": 0.86507833, + "learning_rate": 0.0005457327122383866, + "loss": 0.87605512, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.16564941, + "step": 2530, + "time_per_iteration": 2.633769989013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024086, + "balance_loss_mlp": 1.01402473, + "epoch": 0.4869180454020777, + "flos": 1412665422336.0, + "grad_norm": 0.019350247330642424, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75660574, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.10058594, + "step": 2531, + "time_per_iteration": 4.829160213470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111265, + "balance_loss_mlp": 1.09450376, + "epoch": 0.48711042708734126, + "flos": 573113885184.0, + "grad_norm": 0.07679109022151961, + "language_loss": 0.7589134, + "learning_rate": 0.0005451122040823244, + "loss": 0.77002603, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.16760254, + "step": 2532, + "time_per_iteration": 2.809295654296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113381, + "balance_loss_mlp": 1.09582114, + "epoch": 0.48730280877260485, + "flos": 626547737088.0, + "grad_norm": 0.07652021477742418, + "language_loss": 0.76977062, + "learning_rate": 0.0005448019236728997, + "loss": 0.78090441, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.17565918, + "step": 2533, + "time_per_iteration": 2.889730930328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111676, + "balance_loss_mlp": 1.09540379, + "epoch": 0.48749519045786843, 
+ "flos": 512479996416.0, + "grad_norm": 0.08912362185496442, + "language_loss": 0.84908152, + "learning_rate": 0.0005444916258698255, + "loss": 0.86019826, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.16271973, + "step": 2534, + "time_per_iteration": 2.6680796146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109562, + "balance_loss_mlp": 1.09297991, + "epoch": 0.48768757214313196, + "flos": 525414657024.0, + "grad_norm": 0.06587099405348051, + "language_loss": 0.85898745, + "learning_rate": 0.0005441813107935704, + "loss": 0.87008309, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.16589355, + "step": 2535, + "time_per_iteration": 2.708963394165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121617, + "balance_loss_mlp": 1.10494018, + "epoch": 0.48787995382839555, + "flos": 505032910848.0, + "grad_norm": 0.07506618076199813, + "language_loss": 0.856264, + "learning_rate": 0.0005438709785646091, + "loss": 0.86748016, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.16687012, + "step": 2536, + "time_per_iteration": 2.5794246196746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111352, + "balance_loss_mlp": 1.0970813, + "epoch": 0.4880723355136591, + "flos": 575172140544.0, + "grad_norm": 0.06872348733444625, + "language_loss": 0.86540043, + "learning_rate": 0.0005435606293034234, + "loss": 0.87653565, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.16442871, + "step": 2537, + "time_per_iteration": 2.663050889968872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116159, + "balance_loss_mlp": 1.0999465, + "epoch": 0.48826471719892267, + "flos": 561444203520.0, + "grad_norm": 0.09164692396838796, + "language_loss": 0.84696114, + "learning_rate": 0.0005432502631305016, + "loss": 0.85812277, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.16210938, + "step": 2538, + "time_per_iteration": 
2.7034809589385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119353, + "balance_loss_mlp": 1.10295033, + "epoch": 0.4884570988841862, + "flos": 726188613120.0, + "grad_norm": 0.06227186407680876, + "language_loss": 0.82968855, + "learning_rate": 0.0005429398801663386, + "loss": 0.84088206, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.1640625, + "step": 2539, + "time_per_iteration": 3.0155930519104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120209, + "balance_loss_mlp": 1.10398471, + "epoch": 0.4886494805694498, + "flos": 431019449856.0, + "grad_norm": 0.10714048411465311, + "language_loss": 0.82757926, + "learning_rate": 0.0005426294805314355, + "loss": 0.83878136, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.16223145, + "step": 2540, + "time_per_iteration": 2.5441384315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115337, + "balance_loss_mlp": 1.09914827, + "epoch": 0.4888418622547134, + "flos": 673006579200.0, + "grad_norm": 0.08648554978838247, + "language_loss": 0.79954243, + "learning_rate": 0.0005423190643463003, + "loss": 0.81069577, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.16186523, + "step": 2541, + "time_per_iteration": 2.992694854736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112211, + "balance_loss_mlp": 1.0954504, + "epoch": 0.4890342439399769, + "flos": 541897579008.0, + "grad_norm": 0.08541624697499144, + "language_loss": 0.82913029, + "learning_rate": 0.0005420086317314473, + "loss": 0.84025246, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.16772461, + "step": 2542, + "time_per_iteration": 2.658069133758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104299, + "balance_loss_mlp": 1.08720386, + "epoch": 0.4892266256252405, + "flos": 590676406272.0, + "grad_norm": 0.06935244738816776, + "language_loss": 0.80814946, + "learning_rate": 
0.0005416981828073971, + "loss": 0.81919247, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.17102051, + "step": 2543, + "time_per_iteration": 2.818812608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_mlp": 1.02991831, + "epoch": 0.48941900731050403, + "flos": 1516296526848.0, + "grad_norm": 0.020152649211275964, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78154421, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.09472656, + "step": 2544, + "time_per_iteration": 4.891278028488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100982, + "balance_loss_mlp": 1.08363652, + "epoch": 0.4896113889957676, + "flos": 470564951040.0, + "grad_norm": 0.07927159683050183, + "language_loss": 0.85168952, + "learning_rate": 0.000541077236513819, + "loss": 0.86269933, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.17346191, + "step": 2545, + "time_per_iteration": 2.589184045791626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094215, + "balance_loss_mlp": 1.07689393, + "epoch": 0.48980377068103115, + "flos": 496557983232.0, + "grad_norm": 0.06748793045052295, + "language_loss": 0.82038838, + "learning_rate": 0.0005407667393853638, + "loss": 0.83133048, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.17333984, + "step": 2546, + "time_per_iteration": 2.6306400299072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099707, + "balance_loss_mlp": 1.08196878, + "epoch": 0.48999615236629473, + "flos": 692852382720.0, + "grad_norm": 0.08073962926855084, + "language_loss": 0.83248717, + "learning_rate": 0.0005404562264298569, + "loss": 0.84348422, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.17749023, + "step": 2547, + "time_per_iteration": 2.890744209289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097618, + "balance_loss_mlp": 
1.0795579, + "epoch": 0.49018853405155827, + "flos": 541694946816.0, + "grad_norm": 0.07477586030938296, + "language_loss": 0.83869213, + "learning_rate": 0.0005401456977678498, + "loss": 0.84966832, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.18078613, + "step": 2548, + "time_per_iteration": 2.691488027572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093416, + "balance_loss_mlp": 1.0753082, + "epoch": 0.49038091573682185, + "flos": 695663894016.0, + "grad_norm": 0.08381067722766777, + "language_loss": 0.77390134, + "learning_rate": 0.0005398351535199008, + "loss": 0.78483546, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.18103027, + "step": 2549, + "time_per_iteration": 3.0651490688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087007, + "balance_loss_mlp": 1.06931591, + "epoch": 0.49057329742208544, + "flos": 596902929408.0, + "grad_norm": 0.05957811074119609, + "language_loss": 0.83473563, + "learning_rate": 0.0005395245938065735, + "loss": 0.84560567, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.17712402, + "step": 2550, + "time_per_iteration": 2.7947916984558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085104, + "balance_loss_mlp": 1.06648386, + "epoch": 0.490765679107349, + "flos": 513406522368.0, + "grad_norm": 0.10016911025461137, + "language_loss": 0.82528293, + "learning_rate": 0.0005392140187484379, + "loss": 0.83613402, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.18603516, + "step": 2551, + "time_per_iteration": 2.6254496574401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089241, + "balance_loss_mlp": 1.0698818, + "epoch": 0.49095806079261256, + "flos": 629606670336.0, + "grad_norm": 0.05979290752357133, + "language_loss": 0.89496678, + "learning_rate": 0.0005389034284660701, + "loss": 0.90585923, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.19348145, + "step": 
2552, + "time_per_iteration": 2.8202950954437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096651, + "balance_loss_mlp": 1.07798314, + "epoch": 0.4911504424778761, + "flos": 915307941888.0, + "grad_norm": 0.09877873271676557, + "language_loss": 0.82097638, + "learning_rate": 0.000538592823080052, + "loss": 0.83194292, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.18676758, + "step": 2553, + "time_per_iteration": 3.156975507736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092631, + "balance_loss_mlp": 1.07395101, + "epoch": 0.4913428241631397, + "flos": 438943380480.0, + "grad_norm": 0.1092160541841064, + "language_loss": 0.84523845, + "learning_rate": 0.000538282202710971, + "loss": 0.85616469, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.18664551, + "step": 2554, + "time_per_iteration": 2.5290331840515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109959, + "balance_loss_mlp": 1.08045673, + "epoch": 0.4915352058484032, + "flos": 636092725248.0, + "grad_norm": 0.10555847882945492, + "language_loss": 0.82219321, + "learning_rate": 0.000537971567479421, + "loss": 0.83318907, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.19128418, + "step": 2555, + "time_per_iteration": 2.755554437637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094808, + "balance_loss_mlp": 1.07547224, + "epoch": 0.4917275875336668, + "flos": 504518989824.0, + "grad_norm": 0.0816634604134734, + "language_loss": 0.87386465, + "learning_rate": 0.0005376609175060011, + "loss": 0.88481277, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.19311523, + "step": 2556, + "time_per_iteration": 2.6251890659332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088346, + "balance_loss_mlp": 1.06941605, + "epoch": 0.49191996921893033, + "flos": 654547267584.0, + "grad_norm": 0.1007754916439506, + "language_loss": 0.80408537, + 
"learning_rate": 0.0005373502529113162, + "loss": 0.81496882, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.18920898, + "step": 2557, + "time_per_iteration": 2.8081767559051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080378, + "balance_loss_mlp": 1.06081533, + "epoch": 0.4921123509041939, + "flos": 492359980032.0, + "grad_norm": 0.09200682846254944, + "language_loss": 0.81391776, + "learning_rate": 0.0005370395738159773, + "loss": 0.82472152, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.19543457, + "step": 2558, + "time_per_iteration": 2.6609818935394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084764, + "balance_loss_mlp": 1.06559491, + "epoch": 0.4923047325894575, + "flos": 546167162880.0, + "grad_norm": 0.08064506015832804, + "language_loss": 0.82711154, + "learning_rate": 0.0005367288803406003, + "loss": 0.83795917, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.19165039, + "step": 2559, + "time_per_iteration": 2.644026756286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084251, + "balance_loss_mlp": 1.06544018, + "epoch": 0.49249711427472104, + "flos": 596473072128.0, + "grad_norm": 0.0889068964261426, + "language_loss": 0.81602907, + "learning_rate": 0.0005364181726058073, + "loss": 0.82687151, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.18798828, + "step": 2560, + "time_per_iteration": 2.7356274127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082609, + "balance_loss_mlp": 1.06403637, + "epoch": 0.4926894959599846, + "flos": 497825533440.0, + "grad_norm": 0.0950227496854857, + "language_loss": 0.82278556, + "learning_rate": 0.0005361074507322261, + "loss": 0.83361161, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.18566895, + "step": 2561, + "time_per_iteration": 2.663046360015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086945, + 
"balance_loss_mlp": 1.06827641, + "epoch": 0.49288187764524816, + "flos": 536130648576.0, + "grad_norm": 0.07772582275378431, + "language_loss": 0.81617248, + "learning_rate": 0.000535796714840489, + "loss": 0.82704192, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.18664551, + "step": 2562, + "time_per_iteration": 2.638414144515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_mlp": 1.07574439, + "epoch": 0.49307425933051174, + "flos": 641555707392.0, + "grad_norm": 0.08606941059340069, + "language_loss": 0.83548921, + "learning_rate": 0.0005354859650512348, + "loss": 0.84643233, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.18566895, + "step": 2563, + "time_per_iteration": 2.786123752593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103932, + "balance_loss_mlp": 1.08636093, + "epoch": 0.4932666410157753, + "flos": 516252911616.0, + "grad_norm": 0.10665890037430359, + "language_loss": 0.87337875, + "learning_rate": 0.0005351752014851074, + "loss": 0.88441813, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.17578125, + "step": 2564, + "time_per_iteration": 2.5858397483825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110675, + "balance_loss_mlp": 1.08847523, + "epoch": 0.49345902270103886, + "flos": 601503625728.0, + "grad_norm": 0.10057993561194663, + "language_loss": 0.83317149, + "learning_rate": 0.0005348644242627553, + "loss": 0.844239, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.1829834, + "step": 2565, + "time_per_iteration": 2.7638742923736572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050217, + "balance_loss_mlp": 1.04082322, + "epoch": 0.49365140438630245, + "flos": 1493673716736.0, + "grad_norm": 0.03479988729177956, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76336837, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 
0.09375, + "step": 2566, + "time_per_iteration": 4.947393417358398 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106718, + "balance_loss_mlp": 1.08951592, + "epoch": 0.493843786071566, + "flos": 629599329792.0, + "grad_norm": 0.06927642597141821, + "language_loss": 0.81322002, + "learning_rate": 0.0005342428293320013, + "loss": 0.82428724, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.17199707, + "step": 2567, + "time_per_iteration": 2.778985023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104881, + "balance_loss_mlp": 1.08785808, + "epoch": 0.49403616775682957, + "flos": 617564030976.0, + "grad_norm": 0.07155621127563581, + "language_loss": 0.83412832, + "learning_rate": 0.0005339320118649238, + "loss": 0.84517711, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.17041016, + "step": 2568, + "time_per_iteration": 2.7361106872558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118929, + "balance_loss_mlp": 1.10148847, + "epoch": 0.4942285494420931, + "flos": 577647770112.0, + "grad_norm": 0.06786367407396048, + "language_loss": 0.86708534, + "learning_rate": 0.000533621181224271, + "loss": 0.87827462, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.17443848, + "step": 2569, + "time_per_iteration": 2.8056747913360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113987, + "balance_loss_mlp": 1.09679675, + "epoch": 0.4944209311273567, + "flos": 630211995648.0, + "grad_norm": 0.08062562134183447, + "language_loss": 0.81321245, + "learning_rate": 0.0005333103375307182, + "loss": 0.82435232, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.17211914, + "step": 2570, + "time_per_iteration": 2.904440402984619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114683, + "balance_loss_mlp": 1.09786248, + "epoch": 0.4946133128126202, + "flos": 587612703744.0, + "grad_norm": 0.06756621210058887, + 
"language_loss": 0.8584491, + "learning_rate": 0.0005329994809049451, + "loss": 0.86959589, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.16833496, + "step": 2571, + "time_per_iteration": 2.8053295612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131581, + "balance_loss_mlp": 1.11458206, + "epoch": 0.4948056944978838, + "flos": 583718648832.0, + "grad_norm": 0.09358938815201079, + "language_loss": 0.87904042, + "learning_rate": 0.0005326886114676375, + "loss": 0.89035624, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.17016602, + "step": 2572, + "time_per_iteration": 2.8100666999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113844, + "balance_loss_mlp": 1.09724987, + "epoch": 0.49499807618314734, + "flos": 481822027776.0, + "grad_norm": 0.06954374103744322, + "language_loss": 0.87645632, + "learning_rate": 0.0005323777293394854, + "loss": 0.88759476, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.16601562, + "step": 2573, + "time_per_iteration": 2.6342670917510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112174, + "balance_loss_mlp": 1.09544909, + "epoch": 0.4951904578684109, + "flos": 518978161152.0, + "grad_norm": 0.06551139751330846, + "language_loss": 0.82055044, + "learning_rate": 0.000532066834641184, + "loss": 0.83167219, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.1673584, + "step": 2574, + "time_per_iteration": 2.7459301948547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115153, + "balance_loss_mlp": 1.09861851, + "epoch": 0.4953828395536745, + "flos": 535505499648.0, + "grad_norm": 0.07271172156944823, + "language_loss": 0.85062492, + "learning_rate": 0.0005317559274934334, + "loss": 0.86177647, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.16540527, + "step": 2575, + "time_per_iteration": 2.79950213432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01109887, + "balance_loss_mlp": 1.0929718, + "epoch": 0.49557522123893805, + "flos": 528564994560.0, + "grad_norm": 0.12491917898667039, + "language_loss": 0.80294836, + "learning_rate": 0.0005314450080169382, + "loss": 0.81404722, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.16931152, + "step": 2576, + "time_per_iteration": 2.646117687225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111519, + "balance_loss_mlp": 1.09459102, + "epoch": 0.49576760292420163, + "flos": 428007504384.0, + "grad_norm": 0.06948953090692808, + "language_loss": 0.80618382, + "learning_rate": 0.0005311340763324083, + "loss": 0.81729901, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.16931152, + "step": 2577, + "time_per_iteration": 2.637355327606201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115745, + "balance_loss_mlp": 1.09885335, + "epoch": 0.49595998460946517, + "flos": 565236942336.0, + "grad_norm": 0.06343391975743103, + "language_loss": 0.82572562, + "learning_rate": 0.0005308231325605578, + "loss": 0.83688301, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.16906738, + "step": 2578, + "time_per_iteration": 2.7532670497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.10721767, + "epoch": 0.49615236629472875, + "flos": 702490973184.0, + "grad_norm": 0.06763129936720796, + "language_loss": 0.76589197, + "learning_rate": 0.0005305121768221061, + "loss": 0.77713311, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.16906738, + "step": 2579, + "time_per_iteration": 3.099548816680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106106, + "balance_loss_mlp": 1.09718919, + "epoch": 0.4963447479799923, + "flos": 1441665630720.0, + "grad_norm": 0.03611799224355641, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76144433, + "num_input_tokens_seen": 215023648, + 
"router_z_loss_mlp": 0.08935547, + "step": 2580, + "time_per_iteration": 4.822290658950806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112299, + "balance_loss_mlp": 1.0955143, + "epoch": 0.49653712966525587, + "flos": 537627995136.0, + "grad_norm": 0.07683784808208224, + "language_loss": 0.91874099, + "learning_rate": 0.0005298902299282984, + "loss": 0.92986393, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.16796875, + "step": 2581, + "time_per_iteration": 2.6493284702301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117923, + "balance_loss_mlp": 1.10141301, + "epoch": 0.4967295113505194, + "flos": 607280467968.0, + "grad_norm": 0.09118838704679054, + "language_loss": 0.84425116, + "learning_rate": 0.0005295792390144033, + "loss": 0.85543042, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.16516113, + "step": 2582, + "time_per_iteration": 2.8000099658966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121007, + "balance_loss_mlp": 1.1042583, + "epoch": 0.496921893035783, + "flos": 474577574400.0, + "grad_norm": 0.08989559260345804, + "language_loss": 0.83660305, + "learning_rate": 0.0005292682366168294, + "loss": 0.84781313, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.16760254, + "step": 2583, + "time_per_iteration": 2.573913812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116101, + "balance_loss_mlp": 1.0993638, + "epoch": 0.4971142747210466, + "flos": 597463838208.0, + "grad_norm": 0.07863246165846992, + "language_loss": 0.79766655, + "learning_rate": 0.0005289572228563181, + "loss": 0.80882752, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.16748047, + "step": 2584, + "time_per_iteration": 2.807269811630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114321, + "balance_loss_mlp": 1.09676123, + "epoch": 0.4973066564063101, + "flos": 599603586048.0, + "grad_norm": 
0.06809186764850061, + "language_loss": 0.8288846, + "learning_rate": 0.000528646197853616, + "loss": 0.84002781, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.17578125, + "step": 2585, + "time_per_iteration": 2.806168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114035, + "balance_loss_mlp": 1.09709597, + "epoch": 0.4974990380915737, + "flos": 649474495488.0, + "grad_norm": 0.06908816819532054, + "language_loss": 0.85582453, + "learning_rate": 0.0005283351617294735, + "loss": 0.86696494, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.16943359, + "step": 2586, + "time_per_iteration": 2.926912784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_mlp": 1.02630937, + "epoch": 0.49769141977683723, + "flos": 1529278548480.0, + "grad_norm": 0.01596603428611825, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77671409, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.08447266, + "step": 2587, + "time_per_iteration": 5.0390965938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107376, + "balance_loss_mlp": 1.08937573, + "epoch": 0.4978838014621008, + "flos": 536370356736.0, + "grad_norm": 0.06339397332392985, + "language_loss": 0.86461538, + "learning_rate": 0.0005277130565998916, + "loss": 0.87568915, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.18005371, + "step": 2588, + "time_per_iteration": 2.770092248916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116474, + "balance_loss_mlp": 1.09942722, + "epoch": 0.49807618314736435, + "flos": 539616867840.0, + "grad_norm": 0.058229952595652015, + "language_loss": 0.81859887, + "learning_rate": 0.0005274019878359748, + "loss": 0.82976359, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.17053223, + "step": 2589, + "time_per_iteration": 2.7338075637817383 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01114654, + "balance_loss_mlp": 1.09733331, + "epoch": 0.49826856483262794, + "flos": 542475740160.0, + "grad_norm": 0.09126406549336552, + "language_loss": 0.86714995, + "learning_rate": 0.0005270909084336628, + "loss": 0.87829649, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.17333984, + "step": 2590, + "time_per_iteration": 2.65108323097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116441, + "balance_loss_mlp": 1.09858298, + "epoch": 0.4984609465178915, + "flos": 522321219072.0, + "grad_norm": 0.1060624554819127, + "language_loss": 0.88702905, + "learning_rate": 0.0005267798185137276, + "loss": 0.89819348, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.17871094, + "step": 2591, + "time_per_iteration": 2.6553287506103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105559, + "balance_loss_mlp": 1.08758211, + "epoch": 0.49865332820315506, + "flos": 574544420352.0, + "grad_norm": 0.13093350294478928, + "language_loss": 0.88770413, + "learning_rate": 0.0005264687181969444, + "loss": 0.89875972, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.17980957, + "step": 2592, + "time_per_iteration": 2.7969043254852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110929, + "balance_loss_mlp": 1.0928092, + "epoch": 0.49884570988841864, + "flos": 1013607115776.0, + "grad_norm": 0.07529154121690083, + "language_loss": 0.74930251, + "learning_rate": 0.0005261576076040937, + "loss": 0.76041174, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.18127441, + "step": 2593, + "time_per_iteration": 3.3571712970733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101158, + "balance_loss_mlp": 1.08368254, + "epoch": 0.4990380915736822, + "flos": 559581239808.0, + "grad_norm": 0.07032432999454871, + "language_loss": 0.83977568, + "learning_rate": 0.0005258464868559591, + "loss": 0.85078728, + 
"num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.17492676, + "step": 2594, + "time_per_iteration": 2.691549301147461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102198, + "balance_loss_mlp": 1.08469868, + "epoch": 0.49923047325894576, + "flos": 498954691584.0, + "grad_norm": 0.06016242034808734, + "language_loss": 0.88749588, + "learning_rate": 0.0005255353560733284, + "loss": 0.89851785, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.17529297, + "step": 2595, + "time_per_iteration": 2.643775701522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074264, + "balance_loss_mlp": 1.0654906, + "epoch": 0.4994228549442093, + "flos": 1496636476416.0, + "grad_norm": 0.03161132267250996, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76652908, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.08789062, + "step": 2596, + "time_per_iteration": 4.8261682987213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011005, + "balance_loss_mlp": 1.08255887, + "epoch": 0.4996152366294729, + "flos": 557374680576.0, + "grad_norm": 0.06872371897226848, + "language_loss": 0.83470559, + "learning_rate": 0.0005249130648877492, + "loss": 0.84571064, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.17956543, + "step": 2597, + "time_per_iteration": 2.793973445892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099762, + "balance_loss_mlp": 1.08096313, + "epoch": 0.4998076183147364, + "flos": 415594105344.0, + "grad_norm": 0.07739235171207769, + "language_loss": 0.84593171, + "learning_rate": 0.0005246019047263953, + "loss": 0.8569293, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.18798828, + "step": 2598, + "time_per_iteration": 2.5284597873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103028, + "balance_loss_mlp": 1.08447933, + "epoch": 0.5, + "flos": 467350373376.0, + 
"grad_norm": 0.0766017052589062, + "language_loss": 0.82300264, + "learning_rate": 0.0005242907350137353, + "loss": 0.83403295, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.18554688, + "step": 2599, + "time_per_iteration": 2.57824969291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102331, + "balance_loss_mlp": 1.08466387, + "epoch": 0.5001923816852636, + "flos": 482718818304.0, + "grad_norm": 0.07109220242790512, + "language_loss": 0.78955519, + "learning_rate": 0.0005239795558705754, + "loss": 0.80057847, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.17675781, + "step": 2600, + "time_per_iteration": 2.735712766647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093225, + "balance_loss_mlp": 1.07491398, + "epoch": 0.5003847633705272, + "flos": 533798180352.0, + "grad_norm": 0.0850656909263446, + "language_loss": 0.89518678, + "learning_rate": 0.0005236683674177264, + "loss": 0.90611899, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.18310547, + "step": 2601, + "time_per_iteration": 2.7013046741485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101472, + "balance_loss_mlp": 1.08336401, + "epoch": 0.5005771450557907, + "flos": 737789285376.0, + "grad_norm": 0.06829559635091415, + "language_loss": 0.82179487, + "learning_rate": 0.0005233571697760021, + "loss": 0.83280951, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.18103027, + "step": 2602, + "time_per_iteration": 2.902503490447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101813, + "balance_loss_mlp": 1.08420539, + "epoch": 0.5007695267410542, + "flos": 778977865728.0, + "grad_norm": 0.10152220944898022, + "language_loss": 0.82961535, + "learning_rate": 0.0005230459630662203, + "loss": 0.84063351, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.17626953, + "step": 2603, + "time_per_iteration": 2.966848134994507 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108628, + "balance_loss_mlp": 1.09103274, + "epoch": 0.5009619084263178, + "flos": 623476694016.0, + "grad_norm": 0.07939636618021073, + "language_loss": 0.8145076, + "learning_rate": 0.0005227347474092022, + "loss": 0.82559389, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.17602539, + "step": 2604, + "time_per_iteration": 2.76577091217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107422, + "balance_loss_mlp": 1.08948135, + "epoch": 0.5011542901115814, + "flos": 531087611904.0, + "grad_norm": 0.06357584490296206, + "language_loss": 0.82990885, + "learning_rate": 0.0005224235229257724, + "loss": 0.84098309, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.17956543, + "step": 2605, + "time_per_iteration": 2.798074245452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108194, + "balance_loss_mlp": 1.09092093, + "epoch": 0.5013466717968449, + "flos": 527534581248.0, + "grad_norm": 0.059877769950401664, + "language_loss": 0.86506116, + "learning_rate": 0.0005221122897367589, + "loss": 0.8761431, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.17285156, + "step": 2606, + "time_per_iteration": 2.8442416191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120744, + "balance_loss_mlp": 1.10386384, + "epoch": 0.5015390534821085, + "flos": 566017735680.0, + "grad_norm": 0.08858636737693353, + "language_loss": 0.81257951, + "learning_rate": 0.0005218010479629932, + "loss": 0.82378697, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.16882324, + "step": 2607, + "time_per_iteration": 2.720196485519409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112059, + "balance_loss_mlp": 1.09503603, + "epoch": 0.5017314351673721, + "flos": 566697212928.0, + "grad_norm": 0.09219088613115281, + "language_loss": 0.82021785, + "learning_rate": 0.0005214897977253102, + "loss": 0.83133841, + 
"num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.17041016, + "step": 2608, + "time_per_iteration": 2.6824939250946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104466, + "balance_loss_mlp": 1.08703792, + "epoch": 0.5019238168526357, + "flos": 522291483648.0, + "grad_norm": 0.05892482680876805, + "language_loss": 0.84221715, + "learning_rate": 0.0005211785391445473, + "loss": 0.85326183, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.17456055, + "step": 2609, + "time_per_iteration": 2.72525954246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.08809578, + "epoch": 0.5021161985378992, + "flos": 641434567680.0, + "grad_norm": 0.07489132465153774, + "language_loss": 0.79042387, + "learning_rate": 0.0005208672723415467, + "loss": 0.80148035, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.17553711, + "step": 2610, + "time_per_iteration": 2.8028247356414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110106, + "balance_loss_mlp": 1.08385801, + "epoch": 0.5023085802231627, + "flos": 591284302848.0, + "grad_norm": 0.08294073768606391, + "language_loss": 0.7915107, + "learning_rate": 0.0005205559974371525, + "loss": 0.80252123, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.17211914, + "step": 2611, + "time_per_iteration": 2.7850143909454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094923, + "balance_loss_mlp": 1.07810235, + "epoch": 0.5025009619084263, + "flos": 472373586432.0, + "grad_norm": 0.07295315460395477, + "language_loss": 0.82193494, + "learning_rate": 0.0005202447145522123, + "loss": 0.83288413, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.16821289, + "step": 2612, + "time_per_iteration": 2.700307607650757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090548, + "balance_loss_mlp": 1.07344127, + "epoch": 0.5026933435936899, + "flos": 
455139606528.0, + "grad_norm": 0.0792727031944949, + "language_loss": 0.79256612, + "learning_rate": 0.0005199334238075769, + "loss": 0.80347157, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.17126465, + "step": 2613, + "time_per_iteration": 2.6153087615966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089787, + "balance_loss_mlp": 1.07271576, + "epoch": 0.5028857252789535, + "flos": 491747314176.0, + "grad_norm": 0.08033639738386796, + "language_loss": 0.91661727, + "learning_rate": 0.0005196221253241, + "loss": 0.92751515, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.17089844, + "step": 2614, + "time_per_iteration": 2.6069750785827637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088826, + "balance_loss_mlp": 1.07155263, + "epoch": 0.503078106964217, + "flos": 625569454080.0, + "grad_norm": 0.07969948054344475, + "language_loss": 0.82871294, + "learning_rate": 0.0005193108192226383, + "loss": 0.83960116, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.17272949, + "step": 2615, + "time_per_iteration": 2.8156328201293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084024, + "balance_loss_mlp": 1.06673825, + "epoch": 0.5032704886494805, + "flos": 579046371840.0, + "grad_norm": 0.06296322155163143, + "language_loss": 0.86797768, + "learning_rate": 0.000518999505624052, + "loss": 0.87881792, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.1730957, + "step": 2616, + "time_per_iteration": 2.7152223587036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080227, + "balance_loss_mlp": 1.06292999, + "epoch": 0.5034628703347441, + "flos": 471753206784.0, + "grad_norm": 0.05958638296552923, + "language_loss": 0.83317488, + "learning_rate": 0.000518688184649203, + "loss": 0.84397715, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.17297363, + "step": 2617, + "time_per_iteration": 2.8284754753112793 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108043, + "balance_loss_mlp": 1.06272697, + "epoch": 0.5036552520200077, + "flos": 489837362688.0, + "grad_norm": 0.07368279711977406, + "language_loss": 0.83787394, + "learning_rate": 0.0005183768564189577, + "loss": 0.84867823, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.17724609, + "step": 2618, + "time_per_iteration": 2.591064453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083174, + "balance_loss_mlp": 1.06613898, + "epoch": 0.5038476337052713, + "flos": 494235426816.0, + "grad_norm": 0.08850035073541652, + "language_loss": 0.81363833, + "learning_rate": 0.0005180655210541838, + "loss": 0.82447004, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.17041016, + "step": 2619, + "time_per_iteration": 2.5832765102386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086664, + "balance_loss_mlp": 1.06910443, + "epoch": 0.5040400153905348, + "flos": 600604263936.0, + "grad_norm": 0.09602250816000424, + "language_loss": 0.8361724, + "learning_rate": 0.0005177541786757527, + "loss": 0.8470391, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.17565918, + "step": 2620, + "time_per_iteration": 2.8272600173950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081312, + "balance_loss_mlp": 1.0633707, + "epoch": 0.5042323970757984, + "flos": 811525962240.0, + "grad_norm": 0.08634316495635827, + "language_loss": 0.82817882, + "learning_rate": 0.000517442829404538, + "loss": 0.838992, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.17956543, + "step": 2621, + "time_per_iteration": 3.0231099128723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108588, + "balance_loss_mlp": 1.06736684, + "epoch": 0.504424778761062, + "flos": 627308706816.0, + "grad_norm": 0.07086048560872778, + "language_loss": 0.87109387, + "learning_rate": 0.0005171314733614166, + "loss": 0.88195264, + 
"num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.18505859, + "step": 2622, + "time_per_iteration": 2.924490213394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092184, + "balance_loss_mlp": 1.07450485, + "epoch": 0.5046171604463255, + "flos": 515911887360.0, + "grad_norm": 0.09670552238526126, + "language_loss": 0.78441215, + "learning_rate": 0.0005168201106672671, + "loss": 0.79533398, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.17700195, + "step": 2623, + "time_per_iteration": 2.7627530097961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081433, + "balance_loss_mlp": 1.06351626, + "epoch": 0.504809542131589, + "flos": 527831188992.0, + "grad_norm": 0.07080566946451637, + "language_loss": 0.8469494, + "learning_rate": 0.0005165087414429717, + "loss": 0.85776377, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.17932129, + "step": 2624, + "time_per_iteration": 2.6216189861297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078588, + "balance_loss_mlp": 1.06013489, + "epoch": 0.5050019238168526, + "flos": 554118257664.0, + "grad_norm": 0.07518378231968396, + "language_loss": 0.83469629, + "learning_rate": 0.0005161973658094144, + "loss": 0.84548217, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.18444824, + "step": 2625, + "time_per_iteration": 2.686030864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077122, + "balance_loss_mlp": 1.05919266, + "epoch": 0.5051943055021162, + "flos": 574774216704.0, + "grad_norm": 0.07052814404413787, + "language_loss": 0.82367003, + "learning_rate": 0.000515885983887482, + "loss": 0.83444118, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.17944336, + "step": 2626, + "time_per_iteration": 2.742265224456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073389, + "balance_loss_mlp": 1.05478024, + "epoch": 0.5053866871873798, + "flos": 
496686463488.0, + "grad_norm": 0.0761715011076948, + "language_loss": 0.84318763, + "learning_rate": 0.0005155745957980636, + "loss": 0.85392147, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.18615723, + "step": 2627, + "time_per_iteration": 2.6049954891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074823, + "balance_loss_mlp": 1.05586839, + "epoch": 0.5055790688726434, + "flos": 502213685760.0, + "grad_norm": 0.07614118511738227, + "language_loss": 0.88045084, + "learning_rate": 0.000515263201662051, + "loss": 0.89119911, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.18945312, + "step": 2628, + "time_per_iteration": 2.7101621627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084597, + "balance_loss_mlp": 1.06617892, + "epoch": 0.5057714505579068, + "flos": 845227809792.0, + "grad_norm": 0.07415998964954142, + "language_loss": 0.82280606, + "learning_rate": 0.0005149518016003378, + "loss": 0.83365202, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.1842041, + "step": 2629, + "time_per_iteration": 3.194669723510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080493, + "balance_loss_mlp": 1.06227767, + "epoch": 0.5059638322431704, + "flos": 497825533440.0, + "grad_norm": 0.07616905133259881, + "language_loss": 0.8214519, + "learning_rate": 0.0005146403957338206, + "loss": 0.83225679, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.18212891, + "step": 2630, + "time_per_iteration": 2.6495327949523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092259, + "balance_loss_mlp": 1.07468796, + "epoch": 0.506156213928434, + "flos": 617843013120.0, + "grad_norm": 0.06296513552488332, + "language_loss": 0.81962919, + "learning_rate": 0.0005143289841833975, + "loss": 0.8305518, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.17578125, + "step": 2631, + "time_per_iteration": 2.8716421127319336 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092491, + "balance_loss_mlp": 1.07512259, + "epoch": 0.5063485956136976, + "flos": 424857166848.0, + "grad_norm": 0.0779936416436138, + "language_loss": 0.82076275, + "learning_rate": 0.0005140175670699696, + "loss": 0.83168757, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.17382812, + "step": 2632, + "time_per_iteration": 2.6159043312072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108859, + "balance_loss_mlp": 1.07069623, + "epoch": 0.5065409772989612, + "flos": 569926471680.0, + "grad_norm": 0.053505876641590386, + "language_loss": 0.82692468, + "learning_rate": 0.0005137061445144395, + "loss": 0.83781052, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.17895508, + "step": 2633, + "time_per_iteration": 2.9435369968414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102566, + "balance_loss_mlp": 1.08499455, + "epoch": 0.5067333589842247, + "flos": 628801284096.0, + "grad_norm": 0.07429237358898076, + "language_loss": 0.86728698, + "learning_rate": 0.000513394716637712, + "loss": 0.87831259, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.17590332, + "step": 2634, + "time_per_iteration": 2.785621404647827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031528, + "balance_loss_mlp": 1.02165747, + "epoch": 0.5069257406694883, + "flos": 1447867187712.0, + "grad_norm": 0.025420781551357425, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80223238, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.09863281, + "step": 2635, + "time_per_iteration": 4.87060809135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103723, + "balance_loss_mlp": 1.08666396, + "epoch": 0.5071181223547518, + "flos": 638835227136.0, + "grad_norm": 0.0808554701524121, + "language_loss": 0.8102541, + "learning_rate": 0.0005127718454042958, + "loss": 0.82129133, + 
"num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.1706543, + "step": 2636, + "time_per_iteration": 2.8784031867980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102685, + "balance_loss_mlp": 1.08523273, + "epoch": 0.5073105040400154, + "flos": 713565241344.0, + "grad_norm": 0.07186288747403746, + "language_loss": 0.84171808, + "learning_rate": 0.0005124604022894269, + "loss": 0.85274494, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.17468262, + "step": 2637, + "time_per_iteration": 2.9495620727539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018568, + "balance_loss_mlp": 1.00903082, + "epoch": 0.5075028857252789, + "flos": 1436447126016.0, + "grad_norm": 0.013467544944548519, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78206789, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.09521484, + "step": 2638, + "time_per_iteration": 4.841961145401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100977, + "balance_loss_mlp": 1.08402538, + "epoch": 0.5076952674105425, + "flos": 571147034112.0, + "grad_norm": 0.0754060533252176, + "language_loss": 0.83016658, + "learning_rate": 0.0005118375016679325, + "loss": 0.84117633, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.16967773, + "step": 2639, + "time_per_iteration": 2.7659313678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094497, + "balance_loss_mlp": 1.07784295, + "epoch": 0.5078876490958061, + "flos": 516712504320.0, + "grad_norm": 0.08036414838520123, + "language_loss": 0.80592823, + "learning_rate": 0.0005115260444031382, + "loss": 0.81687325, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.16662598, + "step": 2640, + "time_per_iteration": 2.6009633541107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012329, + "balance_loss_mlp": 1.00350785, + "epoch": 0.5080800307810697, + "flos": 
1584224428032.0, + "grad_norm": 0.011999730841431432, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79744148, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.08837891, + "step": 2641, + "time_per_iteration": 4.949390411376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097313, + "balance_loss_mlp": 1.08012342, + "epoch": 0.5082724124663333, + "flos": 485209502208.0, + "grad_norm": 0.07347538330964974, + "language_loss": 0.87067777, + "learning_rate": 0.0005109031165700483, + "loss": 0.88165087, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.17211914, + "step": 2642, + "time_per_iteration": 2.571359634399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089013, + "balance_loss_mlp": 1.07212138, + "epoch": 0.5084647941515967, + "flos": 682230366720.0, + "grad_norm": 0.07982577059913512, + "language_loss": 0.8353101, + "learning_rate": 0.0005105916462435945, + "loss": 0.84620023, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.16894531, + "step": 2643, + "time_per_iteration": 2.853332996368408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090538, + "balance_loss_mlp": 1.07358634, + "epoch": 0.5086571758368603, + "flos": 548736768000.0, + "grad_norm": 0.06767023016464803, + "language_loss": 0.85332114, + "learning_rate": 0.0005102801718050989, + "loss": 0.86422646, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.16967773, + "step": 2644, + "time_per_iteration": 2.71907377243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085318, + "balance_loss_mlp": 1.06869972, + "epoch": 0.5088495575221239, + "flos": 564016379904.0, + "grad_norm": 0.08980112743883228, + "language_loss": 0.89031243, + "learning_rate": 0.0005099686933754867, + "loss": 0.9011656, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.16625977, + "step": 2645, + "time_per_iteration": 2.759768009185791 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108757, + "balance_loss_mlp": 1.07075, + "epoch": 0.5090419392073875, + "flos": 551407689216.0, + "grad_norm": 0.07519563415405216, + "language_loss": 0.84095073, + "learning_rate": 0.0005096572110756845, + "loss": 0.85182643, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.16833496, + "step": 2646, + "time_per_iteration": 2.742478132247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083884, + "balance_loss_mlp": 1.06656277, + "epoch": 0.509234320892651, + "flos": 567779383296.0, + "grad_norm": 0.06876057003625125, + "language_loss": 0.85465425, + "learning_rate": 0.0005093457250266205, + "loss": 0.86549312, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.17333984, + "step": 2647, + "time_per_iteration": 2.762909173965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091086, + "balance_loss_mlp": 1.073717, + "epoch": 0.5094267025779146, + "flos": 582609314304.0, + "grad_norm": 0.1044353617825215, + "language_loss": 0.8341682, + "learning_rate": 0.000509034235349224, + "loss": 0.84507906, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.1739502, + "step": 2648, + "time_per_iteration": 2.726165533065796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109791, + "balance_loss_mlp": 1.08109021, + "epoch": 0.5096190842631781, + "flos": 591990944256.0, + "grad_norm": 0.07313436933557896, + "language_loss": 0.81423604, + "learning_rate": 0.0005087227421644266, + "loss": 0.8252151, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.16821289, + "step": 2649, + "time_per_iteration": 2.753390312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108015, + "balance_loss_mlp": 1.09102726, + "epoch": 0.5098114659484417, + "flos": 513562166784.0, + "grad_norm": 0.0718220857310726, + "language_loss": 0.85905892, + "learning_rate": 0.0005084112455931602, + "loss": 0.87013906, + 
"num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.17004395, + "step": 2650, + "time_per_iteration": 2.5981361865997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116436, + "balance_loss_mlp": 1.0991627, + "epoch": 0.5100038476337053, + "flos": 484631341056.0, + "grad_norm": 0.0710139819724768, + "language_loss": 0.84867871, + "learning_rate": 0.0005080997457563586, + "loss": 0.85984302, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.17297363, + "step": 2651, + "time_per_iteration": 2.5604488849639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125127, + "balance_loss_mlp": 1.10802007, + "epoch": 0.5101962293189688, + "flos": 461603266560.0, + "grad_norm": 0.08475984872157578, + "language_loss": 0.78772122, + "learning_rate": 0.0005077882427749569, + "loss": 0.79897249, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.17114258, + "step": 2652, + "time_per_iteration": 2.5588836669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137152, + "balance_loss_mlp": 1.12011659, + "epoch": 0.5103886110042324, + "flos": 587034542592.0, + "grad_norm": 0.0878101507805391, + "language_loss": 0.84672785, + "learning_rate": 0.0005074767367698913, + "loss": 0.85809934, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.17041016, + "step": 2653, + "time_per_iteration": 2.7424826622009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113443, + "balance_loss_mlp": 1.11758542, + "epoch": 0.510580992689496, + "flos": 845260116480.0, + "grad_norm": 0.10879937034210539, + "language_loss": 0.83426005, + "learning_rate": 0.0005071652278620988, + "loss": 0.8456043, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.16845703, + "step": 2654, + "time_per_iteration": 3.09969162940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124784, + "balance_loss_mlp": 1.10785651, + "epoch": 0.5107733743747596, + "flos": 
658624131072.0, + "grad_norm": 0.10475987580925356, + "language_loss": 0.83118153, + "learning_rate": 0.0005068537161725186, + "loss": 0.8424294, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.16943359, + "step": 2655, + "time_per_iteration": 2.82289719581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116916, + "balance_loss_mlp": 1.09999979, + "epoch": 0.510965756060023, + "flos": 701732574720.0, + "grad_norm": 0.07925993280329827, + "language_loss": 0.84691739, + "learning_rate": 0.0005065422018220893, + "loss": 0.85808647, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.16931152, + "step": 2656, + "time_per_iteration": 2.8794078826904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112009, + "balance_loss_mlp": 1.09535527, + "epoch": 0.5111581377452866, + "flos": 559731741696.0, + "grad_norm": 0.07178639525503218, + "language_loss": 0.80310833, + "learning_rate": 0.0005062306849317521, + "loss": 0.81422836, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.16662598, + "step": 2657, + "time_per_iteration": 2.814025402069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110163, + "balance_loss_mlp": 1.09374762, + "epoch": 0.5113505194305502, + "flos": 609024863232.0, + "grad_norm": 0.09425319021973573, + "language_loss": 0.83069956, + "learning_rate": 0.0005059191656224487, + "loss": 0.84180123, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.16418457, + "step": 2658, + "time_per_iteration": 2.7602522373199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110977, + "balance_loss_mlp": 1.09316397, + "epoch": 0.5115429011158138, + "flos": 534477657600.0, + "grad_norm": 0.10010645818095278, + "language_loss": 0.89003229, + "learning_rate": 0.0005056076440151212, + "loss": 0.90113008, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.1661377, + "step": 2659, + "time_per_iteration": 2.7027831077575684 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071712, + "balance_loss_mlp": 1.06413066, + "epoch": 0.5117352828010774, + "flos": 1362213780480.0, + "grad_norm": 0.039772151853185514, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77359831, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.07568359, + "step": 2660, + "time_per_iteration": 4.856590032577515 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115167, + "balance_loss_mlp": 1.09887075, + "epoch": 0.5119276644863409, + "flos": 633740433408.0, + "grad_norm": 0.06725256479668422, + "language_loss": 0.86826003, + "learning_rate": 0.0005049845943901691, + "loss": 0.87941164, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.16296387, + "step": 2661, + "time_per_iteration": 2.8570423126220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122122, + "balance_loss_mlp": 1.10631514, + "epoch": 0.5121200461716044, + "flos": 585598864896.0, + "grad_norm": 0.0894536064907193, + "language_loss": 0.8667441, + "learning_rate": 0.0005046730666144338, + "loss": 0.87796533, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.15795898, + "step": 2662, + "time_per_iteration": 2.883822202682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119148, + "balance_loss_mlp": 1.10315049, + "epoch": 0.512312427856868, + "flos": 1032508767744.0, + "grad_norm": 0.06658438993973123, + "language_loss": 0.87964702, + "learning_rate": 0.0005043615370244532, + "loss": 0.8908385, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.15991211, + "step": 2663, + "time_per_iteration": 3.388521671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_mlp": 1.02103686, + "epoch": 0.5125048095421316, + "flos": 1537983645696.0, + "grad_norm": 0.01281563800895277, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79272962, 
+ "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.07519531, + "step": 2664, + "time_per_iteration": 4.6337666511535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119325, + "balance_loss_mlp": 1.10361338, + "epoch": 0.5126971912273951, + "flos": 591116175360.0, + "grad_norm": 0.058968241204554794, + "language_loss": 0.85154796, + "learning_rate": 0.0005037384728855425, + "loss": 0.86274123, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.15698242, + "step": 2665, + "time_per_iteration": 2.8316938877105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116454, + "balance_loss_mlp": 1.10032547, + "epoch": 0.5128895729126587, + "flos": 551657309184.0, + "grad_norm": 0.07313815870373463, + "language_loss": 0.8427707, + "learning_rate": 0.0005034269385785075, + "loss": 0.85393524, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.16125488, + "step": 2666, + "time_per_iteration": 2.705953359603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119966, + "balance_loss_mlp": 1.10405147, + "epoch": 0.5130819545979223, + "flos": 481271030784.0, + "grad_norm": 0.09131160106886373, + "language_loss": 0.84140623, + "learning_rate": 0.0005031154029410168, + "loss": 0.85260594, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.15905762, + "step": 2667, + "time_per_iteration": 2.5483505725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121112, + "balance_loss_mlp": 1.10497081, + "epoch": 0.5132743362831859, + "flos": 475798136832.0, + "grad_norm": 0.07350853386407429, + "language_loss": 0.86393219, + "learning_rate": 0.0005028038660940197, + "loss": 0.87514335, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.16137695, + "step": 2668, + "time_per_iteration": 2.5729174613952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117082, + "balance_loss_mlp": 1.10103667, + "epoch": 0.5134667179684494, + "flos": 
503827029504.0, + "grad_norm": 0.06973928207648594, + "language_loss": 0.84257567, + "learning_rate": 0.0005024923281584648, + "loss": 0.85374653, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.16040039, + "step": 2669, + "time_per_iteration": 2.695422410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112964, + "balance_loss_mlp": 1.11378479, + "epoch": 0.5136590996537129, + "flos": 503918433792.0, + "grad_norm": 0.07121106891997668, + "language_loss": 0.82480651, + "learning_rate": 0.0005021807892553026, + "loss": 0.8361029, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.15844727, + "step": 2670, + "time_per_iteration": 2.751401662826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129611, + "balance_loss_mlp": 1.11330318, + "epoch": 0.5138514813389765, + "flos": 624623104512.0, + "grad_norm": 0.07354407823714339, + "language_loss": 0.84572917, + "learning_rate": 0.0005018692495054828, + "loss": 0.85702527, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.16308594, + "step": 2671, + "time_per_iteration": 2.757593870162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123606, + "balance_loss_mlp": 1.10785806, + "epoch": 0.5140438630242401, + "flos": 583545752064.0, + "grad_norm": 0.06661441717787603, + "language_loss": 0.80650961, + "learning_rate": 0.0005015577090299561, + "loss": 0.81774569, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.15734863, + "step": 2672, + "time_per_iteration": 2.693725347518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110731, + "balance_loss_mlp": 1.09435153, + "epoch": 0.5142362447095037, + "flos": 487927411200.0, + "grad_norm": 0.07298787487316409, + "language_loss": 0.86515582, + "learning_rate": 0.0005012461679496729, + "loss": 0.87626314, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.16381836, + "step": 2673, + "time_per_iteration": 2.6318869590759277 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111417, + "balance_loss_mlp": 1.09533608, + "epoch": 0.5144286263947672, + "flos": 526857675264.0, + "grad_norm": 0.07740296935823926, + "language_loss": 0.87230647, + "learning_rate": 0.0005009346263855848, + "loss": 0.88342059, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.1607666, + "step": 2674, + "time_per_iteration": 2.6561901569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108221, + "balance_loss_mlp": 1.09159088, + "epoch": 0.5146210080800308, + "flos": 486518897664.0, + "grad_norm": 0.0608007463380774, + "language_loss": 0.83338469, + "learning_rate": 0.0005006230844586422, + "loss": 0.84446692, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.16638184, + "step": 2675, + "time_per_iteration": 2.7956371307373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110833, + "balance_loss_mlp": 1.09186745, + "epoch": 0.5148133897652943, + "flos": 515892063744.0, + "grad_norm": 0.06956599587127472, + "language_loss": 0.78915107, + "learning_rate": 0.0005003115422897968, + "loss": 0.80023432, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.16467285, + "step": 2676, + "time_per_iteration": 2.8026392459869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098426, + "balance_loss_mlp": 1.08178461, + "epoch": 0.5150057714505579, + "flos": 511212446208.0, + "grad_norm": 0.06380905094740742, + "language_loss": 0.87044096, + "learning_rate": 0.0005, + "loss": 0.8814252, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.16650391, + "step": 2677, + "time_per_iteration": 2.6397616863250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096356, + "balance_loss_mlp": 1.07940435, + "epoch": 0.5151981531358215, + "flos": 910909877760.0, + "grad_norm": 0.06972488542821374, + "language_loss": 0.79243249, + "learning_rate": 0.0004996884577102033, + "loss": 0.80339611, + 
"num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.16967773, + "step": 2678, + "time_per_iteration": 3.1194515228271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109136, + "balance_loss_mlp": 1.07438445, + "epoch": 0.515390534821085, + "flos": 471864434688.0, + "grad_norm": 0.07627965924369287, + "language_loss": 0.84695083, + "learning_rate": 0.000499376915541358, + "loss": 0.85786444, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.16992188, + "step": 2679, + "time_per_iteration": 2.7068095207214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089943, + "balance_loss_mlp": 1.07359934, + "epoch": 0.5155829165063486, + "flos": 650119468032.0, + "grad_norm": 0.06818096885322372, + "language_loss": 0.81243503, + "learning_rate": 0.0004990653736144155, + "loss": 0.8233344, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.16345215, + "step": 2680, + "time_per_iteration": 2.8939812183380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108849, + "balance_loss_mlp": 1.07127619, + "epoch": 0.5157752981916122, + "flos": 414262315008.0, + "grad_norm": 0.06989870799279192, + "language_loss": 0.85872787, + "learning_rate": 0.0004987538320503271, + "loss": 0.86961281, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.17236328, + "step": 2681, + "time_per_iteration": 2.5216612815856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082053, + "balance_loss_mlp": 1.06468463, + "epoch": 0.5159676798768758, + "flos": 553841473536.0, + "grad_norm": 0.08598338754099338, + "language_loss": 0.82912159, + "learning_rate": 0.0004984422909700442, + "loss": 0.8399421, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.17382812, + "step": 2682, + "time_per_iteration": 2.665601968765259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081809, + "balance_loss_mlp": 1.06371331, + "epoch": 0.5161600615621393, + "flos": 
586510709760.0, + "grad_norm": 0.06868623883512981, + "language_loss": 0.8358953, + "learning_rate": 0.0004981307504945173, + "loss": 0.84671342, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.1809082, + "step": 2683, + "time_per_iteration": 2.744506597518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084499, + "balance_loss_mlp": 1.06714213, + "epoch": 0.5163524432474028, + "flos": 588843177984.0, + "grad_norm": 0.07139371766694287, + "language_loss": 0.89118385, + "learning_rate": 0.0004978192107446976, + "loss": 0.9020288, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.17370605, + "step": 2684, + "time_per_iteration": 2.840625762939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107857, + "balance_loss_mlp": 1.06075978, + "epoch": 0.5165448249326664, + "flos": 503893840896.0, + "grad_norm": 0.07781566774681065, + "language_loss": 0.87333429, + "learning_rate": 0.0004975076718415353, + "loss": 0.88411999, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.17810059, + "step": 2685, + "time_per_iteration": 2.6297128200531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076856, + "balance_loss_mlp": 1.05923653, + "epoch": 0.51673720661793, + "flos": 416760339456.0, + "grad_norm": 0.07734898237902697, + "language_loss": 0.90289825, + "learning_rate": 0.0004971961339059806, + "loss": 0.91366684, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.17626953, + "step": 2686, + "time_per_iteration": 2.5235214233398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079451, + "balance_loss_mlp": 1.06149805, + "epoch": 0.5169295883031936, + "flos": 598971096576.0, + "grad_norm": 0.08309998288602231, + "language_loss": 0.84119761, + "learning_rate": 0.0004968845970589832, + "loss": 0.85199213, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.17956543, + "step": 2687, + "time_per_iteration": 2.6999969482421875 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085317, + "balance_loss_mlp": 1.06760216, + "epoch": 0.517121969988457, + "flos": 556816343040.0, + "grad_norm": 0.0817039791962864, + "language_loss": 0.84468675, + "learning_rate": 0.0004965730614214926, + "loss": 0.85553992, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.17724609, + "step": 2688, + "time_per_iteration": 2.658827066421509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078556, + "balance_loss_mlp": 1.06094825, + "epoch": 0.5173143516737206, + "flos": 469445704704.0, + "grad_norm": 0.07334441433702203, + "language_loss": 0.85342443, + "learning_rate": 0.0004962615271144576, + "loss": 0.86421001, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.17626953, + "step": 2689, + "time_per_iteration": 2.50878643989563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086966, + "balance_loss_mlp": 1.06994319, + "epoch": 0.5175067333589842, + "flos": 720065977344.0, + "grad_norm": 0.12467871415324963, + "language_loss": 0.82284343, + "learning_rate": 0.0004959499942588264, + "loss": 0.83371305, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.17028809, + "step": 2690, + "time_per_iteration": 2.9249496459960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104705, + "balance_loss_mlp": 1.03822827, + "epoch": 0.5176991150442478, + "flos": 1466188480512.0, + "grad_norm": 0.03199266467607697, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79247075, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.08837891, + "step": 2691, + "time_per_iteration": 4.82594108581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090274, + "balance_loss_mlp": 1.07309616, + "epoch": 0.5178914967295114, + "flos": 612632222208.0, + "grad_norm": 0.07423408614425925, + "language_loss": 0.85369182, + "learning_rate": 0.0004953269333855661, + "loss": 0.86459452, + 
"num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.17175293, + "step": 2692, + "time_per_iteration": 2.777863025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093446, + "balance_loss_mlp": 1.07593369, + "epoch": 0.5180838784147749, + "flos": 500926311936.0, + "grad_norm": 0.08941680356551608, + "language_loss": 0.84251738, + "learning_rate": 0.0004950154056098309, + "loss": 0.85345179, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.17529297, + "step": 2693, + "time_per_iteration": 2.7481398582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097937, + "balance_loss_mlp": 1.08010364, + "epoch": 0.5182762601000385, + "flos": 688832418816.0, + "grad_norm": 0.07099923409869693, + "language_loss": 0.84394872, + "learning_rate": 0.0004947038797692867, + "loss": 0.85492814, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.1784668, + "step": 2694, + "time_per_iteration": 2.8453128337860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113818, + "balance_loss_mlp": 1.096771, + "epoch": 0.518468641785302, + "flos": 665611623936.0, + "grad_norm": 0.06154827687851128, + "language_loss": 0.77520609, + "learning_rate": 0.0004943923559848789, + "loss": 0.78634429, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.1706543, + "step": 2695, + "time_per_iteration": 2.841853141784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124321, + "balance_loss_mlp": 1.10654736, + "epoch": 0.5186610234705656, + "flos": 566714465280.0, + "grad_norm": 0.06645104429405103, + "language_loss": 0.90406942, + "learning_rate": 0.0004940808343775515, + "loss": 0.91531265, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.17773438, + "step": 2696, + "time_per_iteration": 2.749504327774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118087, + "balance_loss_mlp": 1.10027719, + "epoch": 0.5188534051558291, + "flos": 
428879702016.0, + "grad_norm": 0.07841169466401897, + "language_loss": 0.82063687, + "learning_rate": 0.0004937693150682479, + "loss": 0.83181769, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.17810059, + "step": 2697, + "time_per_iteration": 2.5522847175598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118895, + "balance_loss_mlp": 1.10168159, + "epoch": 0.5190457868410927, + "flos": 546349971456.0, + "grad_norm": 0.07394243959698338, + "language_loss": 0.76709116, + "learning_rate": 0.0004934577981779107, + "loss": 0.77828008, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.17224121, + "step": 2698, + "time_per_iteration": 2.72316312789917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115498, + "balance_loss_mlp": 1.09879637, + "epoch": 0.5192381685263563, + "flos": 548605716480.0, + "grad_norm": 0.0912267088784467, + "language_loss": 0.8119272, + "learning_rate": 0.0004931462838274817, + "loss": 0.82308215, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.16711426, + "step": 2699, + "time_per_iteration": 2.8209919929504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107121, + "balance_loss_mlp": 1.08981156, + "epoch": 0.5194305502116199, + "flos": 575263544832.0, + "grad_norm": 0.10066489144579434, + "language_loss": 0.83903617, + "learning_rate": 0.0004928347721379011, + "loss": 0.85010743, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.17333984, + "step": 2700, + "time_per_iteration": 2.679414749145508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098221, + "balance_loss_mlp": 1.08088803, + "epoch": 0.5196229318968835, + "flos": 434258620416.0, + "grad_norm": 0.06308374672073903, + "language_loss": 0.82055807, + "learning_rate": 0.0004925232632301089, + "loss": 0.83154029, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.17346191, + "step": 2701, + "time_per_iteration": 2.5568413734436035 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086175, + "balance_loss_mlp": 1.06934261, + "epoch": 0.5198153135821469, + "flos": 558881938944.0, + "grad_norm": 0.07257701027520803, + "language_loss": 0.79591668, + "learning_rate": 0.0004922117572250431, + "loss": 0.80677843, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.16845703, + "step": 2702, + "time_per_iteration": 2.6907496452331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085203, + "balance_loss_mlp": 1.06819224, + "epoch": 0.5200076952674105, + "flos": 565684051968.0, + "grad_norm": 0.08909916825126464, + "language_loss": 0.80501723, + "learning_rate": 0.0004919002542436414, + "loss": 0.81586921, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.17016602, + "step": 2703, + "time_per_iteration": 2.8154964447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087916, + "balance_loss_mlp": 1.07078612, + "epoch": 0.5202000769526741, + "flos": 571186681344.0, + "grad_norm": 0.07574293506029897, + "language_loss": 0.8094272, + "learning_rate": 0.0004915887544068399, + "loss": 0.82030636, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.17138672, + "step": 2704, + "time_per_iteration": 2.6723296642303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080297, + "balance_loss_mlp": 1.06322646, + "epoch": 0.5203924586379377, + "flos": 694211337216.0, + "grad_norm": 0.08223729103851085, + "language_loss": 0.78410661, + "learning_rate": 0.0004912772578355736, + "loss": 0.79490954, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.1706543, + "step": 2705, + "time_per_iteration": 2.904359817504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080431, + "balance_loss_mlp": 1.06288326, + "epoch": 0.5205848403232012, + "flos": 566509261824.0, + "grad_norm": 0.0867272148609526, + "language_loss": 0.82534099, + "learning_rate": 0.000490965764650776, + "loss": 0.83614528, 
+ "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.17553711, + "step": 2706, + "time_per_iteration": 2.893965005874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082267, + "balance_loss_mlp": 1.06508923, + "epoch": 0.5207772220084648, + "flos": 1214259932160.0, + "grad_norm": 0.08899008608425168, + "language_loss": 0.82646501, + "learning_rate": 0.0004906542749733798, + "loss": 0.83728766, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.171875, + "step": 2707, + "time_per_iteration": 3.642857313156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081324, + "balance_loss_mlp": 1.06468248, + "epoch": 0.5209696036937284, + "flos": 592843318272.0, + "grad_norm": 0.06383765372803735, + "language_loss": 0.85145414, + "learning_rate": 0.0004903427889243156, + "loss": 0.86226737, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.16650391, + "step": 2708, + "time_per_iteration": 2.8898375034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091262, + "balance_loss_mlp": 1.074036, + "epoch": 0.5211619853789919, + "flos": 522889468416.0, + "grad_norm": 0.07905445780966364, + "language_loss": 0.85149866, + "learning_rate": 0.0004900313066245134, + "loss": 0.86241126, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.17236328, + "step": 2709, + "time_per_iteration": 2.65574049949646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088104, + "balance_loss_mlp": 1.07130718, + "epoch": 0.5213543670642555, + "flos": 502799187456.0, + "grad_norm": 0.07812284997006956, + "language_loss": 0.80880928, + "learning_rate": 0.0004897198281949012, + "loss": 0.81969029, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.16796875, + "step": 2710, + "time_per_iteration": 2.672153949737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103259, + "balance_loss_mlp": 1.08604503, + "epoch": 0.521546748749519, + "flos": 
585959712768.0, + "grad_norm": 0.07691692452987973, + "language_loss": 0.77799213, + "learning_rate": 0.0004894083537564057, + "loss": 0.78902471, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.17236328, + "step": 2711, + "time_per_iteration": 2.7532706260681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104375, + "balance_loss_mlp": 1.08732796, + "epoch": 0.5217391304347826, + "flos": 570119192064.0, + "grad_norm": 0.07306223578012608, + "language_loss": 0.80945504, + "learning_rate": 0.0004890968834299519, + "loss": 0.82049876, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.1706543, + "step": 2712, + "time_per_iteration": 2.7456612586975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113403, + "balance_loss_mlp": 1.09663057, + "epoch": 0.5219315121200462, + "flos": 542784457728.0, + "grad_norm": 0.06414784694166918, + "language_loss": 0.7858941, + "learning_rate": 0.0004887854173364633, + "loss": 0.79702818, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.16784668, + "step": 2713, + "time_per_iteration": 2.731410503387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116912, + "balance_loss_mlp": 1.10033011, + "epoch": 0.5221238938053098, + "flos": 550310464512.0, + "grad_norm": 0.062429546921528134, + "language_loss": 0.8127901, + "learning_rate": 0.0004884739555968617, + "loss": 0.82395923, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.16589355, + "step": 2714, + "time_per_iteration": 2.8288521766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024153, + "balance_loss_mlp": 1.01604629, + "epoch": 0.5223162754905732, + "flos": 1355174157312.0, + "grad_norm": 0.017358883808072843, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80001205, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.08105469, + "step": 2715, + "time_per_iteration": 5.007716417312622 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124661, + "balance_loss_mlp": 1.10728037, + "epoch": 0.5225086571758368, + "flos": 567747076608.0, + "grad_norm": 0.06973573346877397, + "language_loss": 0.86611319, + "learning_rate": 0.0004878510456629992, + "loss": 0.87735981, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.17407227, + "step": 2716, + "time_per_iteration": 3.006253957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131765, + "balance_loss_mlp": 1.11461031, + "epoch": 0.5227010388611004, + "flos": 500158001664.0, + "grad_norm": 0.07218030120275976, + "language_loss": 0.85169446, + "learning_rate": 0.00048753959771057314, + "loss": 0.86301208, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.17175293, + "step": 2717, + "time_per_iteration": 2.6976563930511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121586, + "balance_loss_mlp": 1.10383558, + "epoch": 0.522893420546364, + "flos": 597656558592.0, + "grad_norm": 0.07681806180198643, + "language_loss": 0.82615161, + "learning_rate": 0.0004872281545957044, + "loss": 0.83736753, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.1776123, + "step": 2718, + "time_per_iteration": 2.8015332221984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117931, + "balance_loss_mlp": 1.10027635, + "epoch": 0.5230858022316276, + "flos": 664605803520.0, + "grad_norm": 0.058351443586734386, + "language_loss": 0.85597366, + "learning_rate": 0.0004869167164393055, + "loss": 0.86715293, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.17675781, + "step": 2719, + "time_per_iteration": 2.9708495140075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116486, + "balance_loss_mlp": 1.09911728, + "epoch": 0.5232781839168911, + "flos": 603843434496.0, + "grad_norm": 0.06620613765458017, + "language_loss": 0.88742125, + "learning_rate": 0.00048660528336228793, + "loss": 
0.89858615, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.17382812, + "step": 2720, + "time_per_iteration": 2.7995879650115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106216, + "balance_loss_mlp": 1.08846569, + "epoch": 0.5234705656021547, + "flos": 550718300160.0, + "grad_norm": 0.06179859794056996, + "language_loss": 0.90307331, + "learning_rate": 0.0004862938554855606, + "loss": 0.91413546, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.1776123, + "step": 2721, + "time_per_iteration": 2.8321540355682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104126, + "balance_loss_mlp": 1.08690071, + "epoch": 0.5236629472874182, + "flos": 504279281664.0, + "grad_norm": 0.07085532730134622, + "language_loss": 0.85930234, + "learning_rate": 0.0004859824329300304, + "loss": 0.87034363, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.17248535, + "step": 2722, + "time_per_iteration": 2.6302812099456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110407, + "balance_loss_mlp": 1.08649826, + "epoch": 0.5238553289726818, + "flos": 547654597632.0, + "grad_norm": 0.07263306317055565, + "language_loss": 0.83477378, + "learning_rate": 0.00048567101581660244, + "loss": 0.84581447, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.17590332, + "step": 2723, + "time_per_iteration": 2.68910813331604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109903, + "balance_loss_mlp": 1.08181643, + "epoch": 0.5240477106579453, + "flos": 531962380800.0, + "grad_norm": 0.11439626446879424, + "language_loss": 0.87057537, + "learning_rate": 0.00048535960426617956, + "loss": 0.88156569, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.17236328, + "step": 2724, + "time_per_iteration": 2.622817039489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.07238674, + "epoch": 0.5242400923432089, + 
"flos": 617939559936.0, + "grad_norm": 0.061793488209652164, + "language_loss": 0.8146565, + "learning_rate": 0.0004850481983996621, + "loss": 0.8255589, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.17871094, + "step": 2725, + "time_per_iteration": 2.7661449909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_mlp": 1.07968855, + "epoch": 0.5244324740284725, + "flos": 416686187520.0, + "grad_norm": 0.1002744758401102, + "language_loss": 0.87726384, + "learning_rate": 0.0004847367983379492, + "loss": 0.8882367, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.17602539, + "step": 2726, + "time_per_iteration": 2.501094341278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096174, + "balance_loss_mlp": 1.0795207, + "epoch": 0.5246248557137361, + "flos": 626436509184.0, + "grad_norm": 0.06877444759134967, + "language_loss": 0.78732175, + "learning_rate": 0.00048442540420193643, + "loss": 0.79828346, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.16662598, + "step": 2727, + "time_per_iteration": 2.9280529022216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091328, + "balance_loss_mlp": 1.07391191, + "epoch": 0.5248172373989997, + "flos": 1248463590912.0, + "grad_norm": 0.07855483173762376, + "language_loss": 0.79334521, + "learning_rate": 0.0004841140161125182, + "loss": 0.80425853, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.17431641, + "step": 2728, + "time_per_iteration": 3.626858711242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093412, + "balance_loss_mlp": 1.07654381, + "epoch": 0.5250096190842631, + "flos": 506868710400.0, + "grad_norm": 0.08285412332857332, + "language_loss": 0.8463819, + "learning_rate": 0.0004838026341905857, + "loss": 0.85731602, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.16870117, + "step": 2729, + "time_per_iteration": 2.7793312072753906 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088713, + "balance_loss_mlp": 1.07182097, + "epoch": 0.5252020007695267, + "flos": 611317684224.0, + "grad_norm": 0.07499858641848273, + "language_loss": 0.85196304, + "learning_rate": 0.00048349125855702844, + "loss": 0.86285013, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.16906738, + "step": 2730, + "time_per_iteration": 2.8079419136047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092888, + "balance_loss_mlp": 1.07605541, + "epoch": 0.5253943824547903, + "flos": 539233998336.0, + "grad_norm": 0.07740216541040414, + "language_loss": 0.81396556, + "learning_rate": 0.00048317988933273287, + "loss": 0.82489449, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.16845703, + "step": 2731, + "time_per_iteration": 2.772430419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084718, + "balance_loss_mlp": 1.06807661, + "epoch": 0.5255867641400539, + "flos": 698038580736.0, + "grad_norm": 0.18745226220584338, + "language_loss": 0.82080007, + "learning_rate": 0.00048286852663858367, + "loss": 0.83164728, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.16650391, + "step": 2732, + "time_per_iteration": 2.9268972873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087343, + "balance_loss_mlp": 1.07036781, + "epoch": 0.5257791458253175, + "flos": 667289207808.0, + "grad_norm": 0.08325512934533874, + "language_loss": 0.8380754, + "learning_rate": 0.000482557170595462, + "loss": 0.84894884, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.16992188, + "step": 2733, + "time_per_iteration": 2.8951096534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093841, + "balance_loss_mlp": 1.07677019, + "epoch": 0.525971527510581, + "flos": 483620751360.0, + "grad_norm": 0.08900957978988387, + "language_loss": 0.87469298, + "learning_rate": 0.0004822458213242475, + "loss": 
0.88563132, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.17089844, + "step": 2734, + "time_per_iteration": 2.5620529651641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110054, + "balance_loss_mlp": 1.09249437, + "epoch": 0.5261639091958445, + "flos": 829916264448.0, + "grad_norm": 0.0633406501514696, + "language_loss": 0.85937345, + "learning_rate": 0.00048193447894581627, + "loss": 0.87047398, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.17565918, + "step": 2735, + "time_per_iteration": 3.103132486343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118378, + "balance_loss_mlp": 1.10083008, + "epoch": 0.5263562908811081, + "flos": 520715215872.0, + "grad_norm": 0.0756952830822362, + "language_loss": 0.87890029, + "learning_rate": 0.00048162314358104243, + "loss": 0.89008415, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.17565918, + "step": 2736, + "time_per_iteration": 2.6416001319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117726, + "balance_loss_mlp": 1.10027409, + "epoch": 0.5265486725663717, + "flos": 574996672512.0, + "grad_norm": 0.09251963370546762, + "language_loss": 0.83179659, + "learning_rate": 0.0004813118153507969, + "loss": 0.84297383, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.17468262, + "step": 2737, + "time_per_iteration": 2.7370142936706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078212, + "balance_loss_mlp": 1.0679127, + "epoch": 0.5267410542516352, + "flos": 1547261015040.0, + "grad_norm": 0.03576440897911325, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83525336, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.10302734, + "step": 2738, + "time_per_iteration": 4.797177076339722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110775, + "balance_loss_mlp": 1.08933258, + "epoch": 0.5269334359368988, + 
"flos": 929952493056.0, + "grad_norm": 0.07588810399495584, + "language_loss": 0.83266842, + "learning_rate": 0.00048068918077736163, + "loss": 0.84374589, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.18408203, + "step": 2739, + "time_per_iteration": 3.2253060340881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109875, + "balance_loss_mlp": 1.0805707, + "epoch": 0.5271258176221624, + "flos": 655389729792.0, + "grad_norm": 0.07650809384335877, + "language_loss": 0.81149924, + "learning_rate": 0.0004803778746759001, + "loss": 0.82248676, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.18188477, + "step": 2740, + "time_per_iteration": 2.917982578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091336, + "balance_loss_mlp": 1.07380056, + "epoch": 0.527318199307426, + "flos": 543036648960.0, + "grad_norm": 0.08493152657291815, + "language_loss": 0.81563872, + "learning_rate": 0.00048006657619242317, + "loss": 0.82655203, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.17553711, + "step": 2741, + "time_per_iteration": 2.6491029262542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083991, + "balance_loss_mlp": 1.0661335, + "epoch": 0.5275105809926895, + "flos": 447882670080.0, + "grad_norm": 0.09642753382189671, + "language_loss": 0.78573406, + "learning_rate": 0.00047975528544778775, + "loss": 0.79657394, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.17858887, + "step": 2742, + "time_per_iteration": 2.6600565910339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080617, + "balance_loss_mlp": 1.06256926, + "epoch": 0.527702962677953, + "flos": 578935143936.0, + "grad_norm": 0.07268225763303592, + "language_loss": 0.88256997, + "learning_rate": 0.00047944400256284754, + "loss": 0.89337611, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.18041992, + "step": 2743, + "time_per_iteration": 
2.7662084102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108031, + "balance_loss_mlp": 1.06228542, + "epoch": 0.5278953443632166, + "flos": 652773136896.0, + "grad_norm": 0.07011617815169531, + "language_loss": 0.79666251, + "learning_rate": 0.0004791327276584532, + "loss": 0.80746561, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.18041992, + "step": 2744, + "time_per_iteration": 2.835545301437378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075943, + "balance_loss_mlp": 1.05737054, + "epoch": 0.5280877260484802, + "flos": 514001935872.0, + "grad_norm": 0.08121623581547996, + "language_loss": 0.80470204, + "learning_rate": 0.00047882146085545264, + "loss": 0.81546152, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.18566895, + "step": 2745, + "time_per_iteration": 2.690206289291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_mlp": 1.02781987, + "epoch": 0.5282801077337438, + "flos": 1445460567552.0, + "grad_norm": 0.02647915133994321, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76439977, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.09765625, + "step": 2746, + "time_per_iteration": 5.020122766494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074184, + "balance_loss_mlp": 1.05564749, + "epoch": 0.5284724894190073, + "flos": 604856595456.0, + "grad_norm": 0.0832805570330896, + "language_loss": 0.79321563, + "learning_rate": 0.00047819895203700684, + "loss": 0.80395758, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.18530273, + "step": 2747, + "time_per_iteration": 2.770418167114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_mlp": 1.02084875, + "epoch": 0.5286648711042709, + "flos": 1494956321280.0, + "grad_norm": 0.025219008400043496, + "language_loss": 0.75512433, + "learning_rate": 
0.0004778877102632412, + "loss": 0.76542532, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.09228516, + "step": 2748, + "time_per_iteration": 4.670547246932983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_mlp": 1.04841685, + "epoch": 0.5288572527895344, + "flos": 597616911360.0, + "grad_norm": 0.08023961077007181, + "language_loss": 0.88480437, + "learning_rate": 0.0004775764770742277, + "loss": 0.89546895, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.18041992, + "step": 2749, + "time_per_iteration": 2.8597028255462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074765, + "balance_loss_mlp": 1.05651426, + "epoch": 0.529049634474798, + "flos": 557320352256.0, + "grad_norm": 0.0872100074417497, + "language_loss": 0.86519742, + "learning_rate": 0.00047726525259079777, + "loss": 0.87594503, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.18237305, + "step": 2750, + "time_per_iteration": 2.7900798320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080608, + "balance_loss_mlp": 1.06233358, + "epoch": 0.5292420161600616, + "flos": 581274952704.0, + "grad_norm": 0.10808949355702925, + "language_loss": 0.88474864, + "learning_rate": 0.0004769540369337798, + "loss": 0.89555472, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.18261719, + "step": 2751, + "time_per_iteration": 2.7448270320892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083505, + "balance_loss_mlp": 1.0650394, + "epoch": 0.5294343978453251, + "flos": 608303167488.0, + "grad_norm": 0.06879132043127602, + "language_loss": 0.85886008, + "learning_rate": 0.00047664283022399794, + "loss": 0.86969519, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.18469238, + "step": 2752, + "time_per_iteration": 2.8719866275787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080341, + "balance_loss_mlp": 1.06261468, 
+ "epoch": 0.5296267795305887, + "flos": 646522020864.0, + "grad_norm": 0.0740043611556158, + "language_loss": 0.81022358, + "learning_rate": 0.00047633163258227376, + "loss": 0.82102704, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.17736816, + "step": 2753, + "time_per_iteration": 2.904007911682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_mlp": 1.06734776, + "epoch": 0.5298191612158523, + "flos": 559746796032.0, + "grad_norm": 0.07290364739094941, + "language_loss": 0.85516405, + "learning_rate": 0.0004760204441294247, + "loss": 0.86601269, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.17529297, + "step": 2754, + "time_per_iteration": 2.728672504425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095448, + "balance_loss_mlp": 1.07741165, + "epoch": 0.5300115429011159, + "flos": 514046352384.0, + "grad_norm": 0.0727695026629463, + "language_loss": 0.86100507, + "learning_rate": 0.00047570926498626486, + "loss": 0.87195957, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.18066406, + "step": 2755, + "time_per_iteration": 2.726902484893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099745, + "balance_loss_mlp": 1.08242369, + "epoch": 0.5302039245863793, + "flos": 672789265920.0, + "grad_norm": 0.05921570741986168, + "language_loss": 0.81395233, + "learning_rate": 0.00047539809527360474, + "loss": 0.82494974, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.17333984, + "step": 2756, + "time_per_iteration": 2.87945556640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115628, + "balance_loss_mlp": 1.09774637, + "epoch": 0.5303963062716429, + "flos": 730836297216.0, + "grad_norm": 0.05551434768366506, + "language_loss": 0.82287431, + "learning_rate": 0.0004750869351122511, + "loss": 0.83403063, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.17883301, + "step": 2757, + 
"time_per_iteration": 3.0493249893188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112447, + "balance_loss_mlp": 1.10749459, + "epoch": 0.5305886879569065, + "flos": 573435085824.0, + "grad_norm": 0.0694425557197165, + "language_loss": 0.82020032, + "learning_rate": 0.00047477578462300685, + "loss": 0.83144498, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.16992188, + "step": 2758, + "time_per_iteration": 2.7602713108062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123309, + "balance_loss_mlp": 1.10578477, + "epoch": 0.5307810696421701, + "flos": 695335352832.0, + "grad_norm": 0.07804964416900076, + "language_loss": 0.79339695, + "learning_rate": 0.0004744646439266718, + "loss": 0.80463004, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.17541504, + "step": 2759, + "time_per_iteration": 3.010812997817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119146, + "balance_loss_mlp": 1.10195613, + "epoch": 0.5309734513274337, + "flos": 648943322112.0, + "grad_norm": 0.056360612774155563, + "language_loss": 0.92028886, + "learning_rate": 0.000474153513144041, + "loss": 0.93148029, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.17199707, + "step": 2760, + "time_per_iteration": 2.9704673290252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128006, + "balance_loss_mlp": 1.11117363, + "epoch": 0.5311658330126972, + "flos": 604824288768.0, + "grad_norm": 0.08001771173719906, + "language_loss": 0.86726296, + "learning_rate": 0.00047384239239590633, + "loss": 0.87854302, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.16845703, + "step": 2761, + "time_per_iteration": 2.891458749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129372, + "balance_loss_mlp": 1.11169338, + "epoch": 0.5313582146979607, + "flos": 558259361280.0, + "grad_norm": 0.06781273866770807, + "language_loss": 0.88723642, + 
"learning_rate": 0.0004735312818030556, + "loss": 0.89853013, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.17700195, + "step": 2762, + "time_per_iteration": 2.7164249420166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127323, + "balance_loss_mlp": 1.11076498, + "epoch": 0.5315505963832243, + "flos": 508410473472.0, + "grad_norm": 0.06505824064287292, + "language_loss": 0.82414401, + "learning_rate": 0.0004732201814862727, + "loss": 0.83541727, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.16564941, + "step": 2763, + "time_per_iteration": 2.726468563079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123829, + "balance_loss_mlp": 1.10723543, + "epoch": 0.5317429780684879, + "flos": 626439080448.0, + "grad_norm": 0.06470267434285343, + "language_loss": 0.81489587, + "learning_rate": 0.0004729090915663373, + "loss": 0.82613409, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.16601562, + "step": 2764, + "time_per_iteration": 2.8475723266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123779, + "balance_loss_mlp": 1.10759008, + "epoch": 0.5319353597537514, + "flos": 476744486400.0, + "grad_norm": 0.11068637871952317, + "language_loss": 0.85001844, + "learning_rate": 0.00047259801216402534, + "loss": 0.86125624, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.16186523, + "step": 2765, + "time_per_iteration": 2.540780544281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116718, + "balance_loss_mlp": 1.10029066, + "epoch": 0.532127741439015, + "flos": 501635524608.0, + "grad_norm": 0.07674788190906832, + "language_loss": 0.86407942, + "learning_rate": 0.00047228694340010845, + "loss": 0.87524652, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.16430664, + "step": 2766, + "time_per_iteration": 2.590508460998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121045, + 
"balance_loss_mlp": 1.1044749, + "epoch": 0.5323201231242786, + "flos": 1164586512384.0, + "grad_norm": 0.07081285799421494, + "language_loss": 0.85664678, + "learning_rate": 0.0004719758853953544, + "loss": 0.86785722, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.16577148, + "step": 2767, + "time_per_iteration": 3.6536149978637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118047, + "balance_loss_mlp": 1.10160804, + "epoch": 0.5325125048095422, + "flos": 378702273024.0, + "grad_norm": 0.1001432749586202, + "language_loss": 0.83710611, + "learning_rate": 0.00047166483827052645, + "loss": 0.84828657, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.16442871, + "step": 2768, + "time_per_iteration": 2.437939167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234354, + "balance_loss_mlp": 1.22538948, + "epoch": 0.5327048864948057, + "flos": 1541353121280.0, + "grad_norm": 0.06972612650118978, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78312844, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.08984375, + "step": 2769, + "time_per_iteration": 5.026838779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115645, + "balance_loss_mlp": 1.09895587, + "epoch": 0.5328972681800692, + "flos": 911272923648.0, + "grad_norm": 0.0780544569178282, + "language_loss": 0.83743083, + "learning_rate": 0.000471042777143682, + "loss": 0.84858727, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.16699219, + "step": 2770, + "time_per_iteration": 3.230933427810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113101, + "balance_loss_mlp": 1.09710324, + "epoch": 0.5330896498653328, + "flos": 473898097152.0, + "grad_norm": 0.20675341395216595, + "language_loss": 0.79602915, + "learning_rate": 0.0004707317633831707, + "loss": 0.80716014, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 
0.15991211, + "step": 2771, + "time_per_iteration": 2.6368706226348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106292, + "balance_loss_mlp": 1.09012711, + "epoch": 0.5332820315505964, + "flos": 501635524608.0, + "grad_norm": 0.0712649510509903, + "language_loss": 0.77926189, + "learning_rate": 0.00047042076098559673, + "loss": 0.79032481, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.16162109, + "step": 2772, + "time_per_iteration": 2.633755683898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105276, + "balance_loss_mlp": 1.08895612, + "epoch": 0.53347441323586, + "flos": 924439951872.0, + "grad_norm": 0.08177633680773212, + "language_loss": 0.74153018, + "learning_rate": 0.00047010977007170174, + "loss": 0.75258291, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.16320801, + "step": 2773, + "time_per_iteration": 3.257364273071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105171, + "balance_loss_mlp": 1.08880353, + "epoch": 0.5336667949211235, + "flos": 574455587328.0, + "grad_norm": 0.08878543355304569, + "language_loss": 0.8234973, + "learning_rate": 0.00046979879076222334, + "loss": 0.83454895, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.16369629, + "step": 2774, + "time_per_iteration": 2.6948111057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115652, + "balance_loss_mlp": 1.09958255, + "epoch": 0.533859176606387, + "flos": 1064664082944.0, + "grad_norm": 0.07031279684874672, + "language_loss": 0.84660083, + "learning_rate": 0.0004694878231778939, + "loss": 0.85775733, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.16064453, + "step": 2775, + "time_per_iteration": 3.391101121902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111582, + "balance_loss_mlp": 1.09510732, + "epoch": 0.5340515582916506, + "flos": 746602665984.0, + "grad_norm": 0.06461927889010362, + 
"language_loss": 0.84379047, + "learning_rate": 0.0004691768674394423, + "loss": 0.85490632, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.16479492, + "step": 2776, + "time_per_iteration": 2.9977481365203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_mlp": 1.03071785, + "epoch": 0.5342439399769142, + "flos": 1445685594624.0, + "grad_norm": 0.02105469632037268, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85523784, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.09082031, + "step": 2777, + "time_per_iteration": 4.769741535186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_mlp": 1.02591205, + "epoch": 0.5344363216621778, + "flos": 1427569505280.0, + "grad_norm": 0.019005935883373085, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77688694, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.09228516, + "step": 2778, + "time_per_iteration": 4.987689733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118552, + "balance_loss_mlp": 1.10211313, + "epoch": 0.5346287033474413, + "flos": 527618644992.0, + "grad_norm": 0.06371644955079436, + "language_loss": 0.79125863, + "learning_rate": 0.00046824407250656676, + "loss": 0.80244416, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.16442871, + "step": 2779, + "time_per_iteration": 2.6410112380981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112995, + "balance_loss_mlp": 1.09662735, + "epoch": 0.5348210850327049, + "flos": 510762765312.0, + "grad_norm": 0.060742687445953125, + "language_loss": 0.83655095, + "learning_rate": 0.0004679331653588161, + "loss": 0.84768081, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.16369629, + "step": 2780, + "time_per_iteration": 2.625710964202881 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01112315, + "balance_loss_mlp": 1.09542346, + "epoch": 0.5350134667179685, + "flos": 462668184576.0, + "grad_norm": 0.07272998333963254, + "language_loss": 0.85177255, + "learning_rate": 0.0004676222706605147, + "loss": 0.86289573, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.16906738, + "step": 2781, + "time_per_iteration": 2.673433542251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110827, + "balance_loss_mlp": 1.09407806, + "epoch": 0.535205848403232, + "flos": 708875712000.0, + "grad_norm": 0.07193058078875894, + "language_loss": 0.85307002, + "learning_rate": 0.0004673113885323626, + "loss": 0.8641783, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.16748047, + "step": 2782, + "time_per_iteration": 2.8941664695739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106993, + "balance_loss_mlp": 1.09025598, + "epoch": 0.5353982300884956, + "flos": 894241575936.0, + "grad_norm": 0.10372367104553785, + "language_loss": 0.78561115, + "learning_rate": 0.00046700051909505494, + "loss": 0.79668105, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.16748047, + "step": 2783, + "time_per_iteration": 3.2081563472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111085, + "balance_loss_mlp": 1.09330261, + "epoch": 0.5355906117737591, + "flos": 535965092352.0, + "grad_norm": 0.06865237294530599, + "language_loss": 0.83605123, + "learning_rate": 0.000466689662469282, + "loss": 0.84715974, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.17553711, + "step": 2784, + "time_per_iteration": 2.6711413860321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104532, + "balance_loss_mlp": 1.08773518, + "epoch": 0.5357829934590227, + "flos": 868846528512.0, + "grad_norm": 0.08186219318834767, + "language_loss": 0.83921355, + "learning_rate": 0.00046637881877572917, + "loss": 0.85025889, + "num_input_tokens_seen": 
232337232, + "router_z_loss_mlp": 0.16809082, + "step": 2785, + "time_per_iteration": 3.1084179878234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094145, + "balance_loss_mlp": 1.07644248, + "epoch": 0.5359753751442863, + "flos": 553287905280.0, + "grad_norm": 0.07421115565240126, + "language_loss": 0.84573698, + "learning_rate": 0.0004660679881350764, + "loss": 0.85667843, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.17736816, + "step": 2786, + "time_per_iteration": 2.7627315521240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_mlp": 1.02681208, + "epoch": 0.5361677568295499, + "flos": 1480499347968.0, + "grad_norm": 0.02311153951998418, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76644635, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.09667969, + "step": 2787, + "time_per_iteration": 5.0513763427734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086082, + "balance_loss_mlp": 1.06855869, + "epoch": 0.5363601385148133, + "flos": 806255700480.0, + "grad_norm": 0.07609779475010685, + "language_loss": 0.77801538, + "learning_rate": 0.0004654463664951667, + "loss": 0.78887624, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.17541504, + "step": 2788, + "time_per_iteration": 3.050717353820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085404, + "balance_loss_mlp": 1.06829762, + "epoch": 0.5365525202000769, + "flos": 507879300096.0, + "grad_norm": 0.06896319927596091, + "language_loss": 0.82818955, + "learning_rate": 0.0004651355757372447, + "loss": 0.83904356, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.17126465, + "step": 2789, + "time_per_iteration": 2.621809244155884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108222, + "balance_loss_mlp": 1.064816, + "epoch": 0.5367449018853405, + "flos": 528930611712.0, + "grad_norm": 
0.06368186458097214, + "language_loss": 0.85671151, + "learning_rate": 0.00046482479851489274, + "loss": 0.86753374, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.17431641, + "step": 2790, + "time_per_iteration": 2.6873245239257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107657, + "balance_loss_mlp": 1.05957103, + "epoch": 0.5369372835706041, + "flos": 649934088192.0, + "grad_norm": 0.09368235748008798, + "language_loss": 0.77583152, + "learning_rate": 0.00046451403494876525, + "loss": 0.78659725, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.17016602, + "step": 2791, + "time_per_iteration": 2.9352025985717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073051, + "balance_loss_mlp": 1.05570602, + "epoch": 0.5371296652558677, + "flos": 584489530368.0, + "grad_norm": 0.09106511666805264, + "language_loss": 0.84479213, + "learning_rate": 0.0004642032851595111, + "loss": 0.85552263, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.17358398, + "step": 2792, + "time_per_iteration": 2.8460757732391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107606, + "balance_loss_mlp": 1.05853653, + "epoch": 0.5373220469411312, + "flos": 595872516096.0, + "grad_norm": 0.09557816920928826, + "language_loss": 0.84886861, + "learning_rate": 0.00046389254926777404, + "loss": 0.85962915, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.17541504, + "step": 2793, + "time_per_iteration": 2.8258917331695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071465, + "balance_loss_mlp": 1.05381024, + "epoch": 0.5375144286263948, + "flos": 1114426335744.0, + "grad_norm": 0.10419489870866282, + "language_loss": 0.78006279, + "learning_rate": 0.0004635818273941926, + "loss": 0.79077744, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.17675781, + "step": 2794, + "time_per_iteration": 3.5380136966705322 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01073554, + "balance_loss_mlp": 1.05581546, + "epoch": 0.5377068103116583, + "flos": 595608215040.0, + "grad_norm": 0.09943669711596623, + "language_loss": 0.81746304, + "learning_rate": 0.0004632711196593997, + "loss": 0.82819855, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.1776123, + "step": 2795, + "time_per_iteration": 2.780565023422241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076704, + "balance_loss_mlp": 1.05881083, + "epoch": 0.5378991919969219, + "flos": 884200292352.0, + "grad_norm": 0.08810005094672828, + "language_loss": 0.85034251, + "learning_rate": 0.00046296042618402297, + "loss": 0.86110961, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.17907715, + "step": 2796, + "time_per_iteration": 3.099726915359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076408, + "balance_loss_mlp": 1.0591228, + "epoch": 0.5380915736821854, + "flos": 710664523776.0, + "grad_norm": 0.06043623665913195, + "language_loss": 0.79098737, + "learning_rate": 0.0004626497470886839, + "loss": 0.80175149, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.17297363, + "step": 2797, + "time_per_iteration": 2.975820541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082894, + "balance_loss_mlp": 1.06584692, + "epoch": 0.538283955367449, + "flos": 556999151616.0, + "grad_norm": 0.06634785168506467, + "language_loss": 0.81794053, + "learning_rate": 0.00046233908249399897, + "loss": 0.82876945, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.1706543, + "step": 2798, + "time_per_iteration": 2.7805473804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086325, + "balance_loss_mlp": 1.06942129, + "epoch": 0.5384763370527126, + "flos": 513470762496.0, + "grad_norm": 0.07252012949911142, + "language_loss": 0.78733051, + "learning_rate": 0.00046202843252057905, + "loss": 0.79819375, + 
"num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.16906738, + "step": 2799, + "time_per_iteration": 2.666600227355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091679, + "balance_loss_mlp": 1.07437015, + "epoch": 0.5386687187379762, + "flos": 489736046592.0, + "grad_norm": 0.07864108960704319, + "language_loss": 0.83561981, + "learning_rate": 0.00046171779728902896, + "loss": 0.84653658, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.17333984, + "step": 2800, + "time_per_iteration": 2.6010262966156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094958, + "balance_loss_mlp": 1.07766032, + "epoch": 0.5388611004232398, + "flos": 482657149440.0, + "grad_norm": 0.11618067186279732, + "language_loss": 0.85997868, + "learning_rate": 0.000461407176919948, + "loss": 0.87092829, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.17321777, + "step": 2801, + "time_per_iteration": 2.5429272651672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094632, + "balance_loss_mlp": 1.07774007, + "epoch": 0.5390534821085032, + "flos": 560984610816.0, + "grad_norm": 0.08430832790687283, + "language_loss": 0.84795403, + "learning_rate": 0.00046109657153392997, + "loss": 0.85890037, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.16906738, + "step": 2802, + "time_per_iteration": 2.6846201419830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108893, + "balance_loss_mlp": 1.07168102, + "epoch": 0.5392458637937668, + "flos": 488377092096.0, + "grad_norm": 0.08650976784842915, + "language_loss": 0.82548422, + "learning_rate": 0.0004607859812515622, + "loss": 0.83637351, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.17272949, + "step": 2803, + "time_per_iteration": 2.5817925930023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107998, + "balance_loss_mlp": 1.06338573, + "epoch": 0.5394382454790304, + "flos": 
512057479680.0, + "grad_norm": 0.07563802138366026, + "language_loss": 0.87865353, + "learning_rate": 0.00046047540619342667, + "loss": 0.88945341, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.16601562, + "step": 2804, + "time_per_iteration": 2.6165053844451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083951, + "balance_loss_mlp": 1.06755948, + "epoch": 0.539630627164294, + "flos": 567586662912.0, + "grad_norm": 0.07064105870143675, + "language_loss": 0.79886174, + "learning_rate": 0.00046016484648009933, + "loss": 0.8097012, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.16394043, + "step": 2805, + "time_per_iteration": 2.725764274597168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084411, + "balance_loss_mlp": 1.06835365, + "epoch": 0.5398230088495575, + "flos": 526462322688.0, + "grad_norm": 0.07630556738551086, + "language_loss": 0.80977762, + "learning_rate": 0.0004598543022321501, + "loss": 0.82062167, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.16052246, + "step": 2806, + "time_per_iteration": 2.6351540088653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085497, + "balance_loss_mlp": 1.06909394, + "epoch": 0.5400153905348211, + "flos": 538764493824.0, + "grad_norm": 0.0649087683342786, + "language_loss": 0.79606426, + "learning_rate": 0.0004595437735701433, + "loss": 0.80691922, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.1640625, + "step": 2807, + "time_per_iteration": 2.706876516342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085225, + "balance_loss_mlp": 1.06884575, + "epoch": 0.5402077722200846, + "flos": 513539771904.0, + "grad_norm": 0.08230029830948764, + "language_loss": 0.83224154, + "learning_rate": 0.00045923326061463623, + "loss": 0.84309381, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.16381836, + "step": 2808, + "time_per_iteration": 2.7869887351989746 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091151, + "balance_loss_mlp": 1.07481909, + "epoch": 0.5404001539053482, + "flos": 676258232832.0, + "grad_norm": 0.06556687541720137, + "language_loss": 0.81677991, + "learning_rate": 0.00045892276348618113, + "loss": 0.82769144, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.16333008, + "step": 2809, + "time_per_iteration": 3.031975269317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_mlp": 1.03327227, + "epoch": 0.5405925355906118, + "flos": 1554834009600.0, + "grad_norm": 0.026553937309941048, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79302251, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.08154297, + "step": 2810, + "time_per_iteration": 5.018324613571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097064, + "balance_loss_mlp": 1.08080387, + "epoch": 0.5407849172758753, + "flos": 647310154752.0, + "grad_norm": 0.07012301152495938, + "language_loss": 0.80724698, + "learning_rate": 0.000458301817192603, + "loss": 0.81821764, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.16259766, + "step": 2811, + "time_per_iteration": 2.8826699256896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_mlp": 1.02369976, + "epoch": 0.5409772989611389, + "flos": 1407407643648.0, + "grad_norm": 0.020407688998465158, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81873488, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.08007812, + "step": 2812, + "time_per_iteration": 4.821629762649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094803, + "balance_loss_mlp": 1.07879376, + "epoch": 0.5411696806464025, + "flos": 554389899264.0, + "grad_norm": 0.09349970811932752, + "language_loss": 0.87107521, + "learning_rate": 0.00045768093565369983, + "loss": 
0.88202327, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.16003418, + "step": 2813, + "time_per_iteration": 2.798082113265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096657, + "balance_loss_mlp": 1.08068299, + "epoch": 0.5413620623316661, + "flos": 528122654208.0, + "grad_norm": 0.08975534837118274, + "language_loss": 0.8179177, + "learning_rate": 0.0004573705194685646, + "loss": 0.82888424, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.15966797, + "step": 2814, + "time_per_iteration": 2.7093122005462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_mlp": 1.07979465, + "epoch": 0.5415544440169295, + "flos": 598741300224.0, + "grad_norm": 0.07912714625539458, + "language_loss": 0.85284495, + "learning_rate": 0.00045706011983366157, + "loss": 0.86380327, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.16027832, + "step": 2815, + "time_per_iteration": 2.736809253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098087, + "balance_loss_mlp": 1.08264983, + "epoch": 0.5417468257021931, + "flos": 470757671424.0, + "grad_norm": 0.08398974332430421, + "language_loss": 0.82530612, + "learning_rate": 0.00045674973686949847, + "loss": 0.83628702, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.1541748, + "step": 2816, + "time_per_iteration": 2.531439781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105366, + "balance_loss_mlp": 1.08896279, + "epoch": 0.5419392073874567, + "flos": 680819281920.0, + "grad_norm": 0.06449066246678943, + "language_loss": 0.85269451, + "learning_rate": 0.0004564393706965766, + "loss": 0.86374819, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.1640625, + "step": 2817, + "time_per_iteration": 3.0000851154327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112663, + "balance_loss_mlp": 1.0963788, + "epoch": 0.5421315890727203, + 
"flos": 462374148096.0, + "grad_norm": 0.0725055130640743, + "language_loss": 0.81484962, + "learning_rate": 0.00045612902143539116, + "loss": 0.82597625, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.1628418, + "step": 2818, + "time_per_iteration": 2.5587399005889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117291, + "balance_loss_mlp": 1.10132849, + "epoch": 0.5423239707579839, + "flos": 436959277056.0, + "grad_norm": 0.0784970788328837, + "language_loss": 0.81825465, + "learning_rate": 0.00045581868920642986, + "loss": 0.82942754, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.1595459, + "step": 2819, + "time_per_iteration": 2.4901785850524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126012, + "balance_loss_mlp": 1.11031175, + "epoch": 0.5425163524432474, + "flos": 458314536960.0, + "grad_norm": 0.09999971886905719, + "language_loss": 0.79204059, + "learning_rate": 0.00045550837413017457, + "loss": 0.80330074, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.15686035, + "step": 2820, + "time_per_iteration": 2.616154909133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113009, + "balance_loss_mlp": 1.11416399, + "epoch": 0.542708734128511, + "flos": 419495500800.0, + "grad_norm": 0.06819679789144961, + "language_loss": 0.85130954, + "learning_rate": 0.0004551980763271005, + "loss": 0.86261046, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.15917969, + "step": 2821, + "time_per_iteration": 2.655139923095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125407, + "balance_loss_mlp": 1.10927796, + "epoch": 0.5429011158137745, + "flos": 678454880256.0, + "grad_norm": 0.0864844698510893, + "language_loss": 0.83889675, + "learning_rate": 0.0004548877959176756, + "loss": 0.85015082, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.16125488, + "step": 2822, + "time_per_iteration": 2.8853647708892822 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118965, + "balance_loss_mlp": 1.10281217, + "epoch": 0.5430934974990381, + "flos": 540924065280.0, + "grad_norm": 0.08050409404863457, + "language_loss": 0.8577252, + "learning_rate": 0.00045457753302236166, + "loss": 0.86891484, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.16149902, + "step": 2823, + "time_per_iteration": 2.6340198516845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098758, + "balance_loss_mlp": 1.08265328, + "epoch": 0.5432858791843016, + "flos": 658468486656.0, + "grad_norm": 0.09623202069762404, + "language_loss": 0.86938739, + "learning_rate": 0.00045426728776161353, + "loss": 0.88037497, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.16101074, + "step": 2824, + "time_per_iteration": 2.792646646499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093887, + "balance_loss_mlp": 1.07741261, + "epoch": 0.5434782608695652, + "flos": 531935216640.0, + "grad_norm": 0.09943652396187513, + "language_loss": 0.81526875, + "learning_rate": 0.00045395706025587863, + "loss": 0.82620764, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.16479492, + "step": 2825, + "time_per_iteration": 2.6433639526367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086855, + "balance_loss_mlp": 1.07033277, + "epoch": 0.5436706425548288, + "flos": 608501030400.0, + "grad_norm": 0.0973793187026711, + "language_loss": 0.82506776, + "learning_rate": 0.00045364685062559843, + "loss": 0.83593631, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.1652832, + "step": 2826, + "time_per_iteration": 2.8686280250549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082796, + "balance_loss_mlp": 1.06635737, + "epoch": 0.5438630242400924, + "flos": 705418854912.0, + "grad_norm": 0.08127433233154835, + "language_loss": 0.91488934, + "learning_rate": 0.0004533366589912067, + "loss": 
0.92571723, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.16442871, + "step": 2827, + "time_per_iteration": 2.9782917499542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080524, + "balance_loss_mlp": 1.06361961, + "epoch": 0.544055405925356, + "flos": 856425788928.0, + "grad_norm": 0.0854569540023736, + "language_loss": 0.77591085, + "learning_rate": 0.0004530264854731306, + "loss": 0.7867161, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.16918945, + "step": 2828, + "time_per_iteration": 3.036414623260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088214, + "balance_loss_mlp": 1.07106018, + "epoch": 0.5442477876106194, + "flos": 571779523584.0, + "grad_norm": 0.06060788976216288, + "language_loss": 0.83699155, + "learning_rate": 0.00045271633019179034, + "loss": 0.84787375, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.17163086, + "step": 2829, + "time_per_iteration": 2.7964255809783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085625, + "balance_loss_mlp": 1.06869721, + "epoch": 0.544440169295883, + "flos": 625556971008.0, + "grad_norm": 0.07110421348748326, + "language_loss": 0.87746441, + "learning_rate": 0.0004524061932675986, + "loss": 0.88832062, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.16943359, + "step": 2830, + "time_per_iteration": 2.8379290103912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108832, + "balance_loss_mlp": 1.07154715, + "epoch": 0.5446325509811466, + "flos": 836244103680.0, + "grad_norm": 0.09242408982484117, + "language_loss": 0.86632991, + "learning_rate": 0.00045209607482096125, + "loss": 0.87721312, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.16784668, + "step": 2831, + "time_per_iteration": 3.018829345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082759, + "balance_loss_mlp": 1.06516385, + "epoch": 0.5448249326664102, + 
"flos": 483381043200.0, + "grad_norm": 0.07061707018893328, + "language_loss": 0.84004849, + "learning_rate": 0.0004517859749722772, + "loss": 0.85087609, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.17614746, + "step": 2832, + "time_per_iteration": 2.6852874755859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080603, + "balance_loss_mlp": 1.06297243, + "epoch": 0.5450173143516738, + "flos": 561107948544.0, + "grad_norm": 0.0761986265844091, + "language_loss": 0.79247868, + "learning_rate": 0.0004514758938419376, + "loss": 0.8032847, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.17663574, + "step": 2833, + "time_per_iteration": 2.8408279418945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041827, + "balance_loss_mlp": 1.03262424, + "epoch": 0.5452096960369373, + "flos": 1470420988416.0, + "grad_norm": 0.03242070177943237, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77962416, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.09179688, + "step": 2834, + "time_per_iteration": 4.971372842788696 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079743, + "balance_loss_mlp": 1.06190884, + "epoch": 0.5454020777222008, + "flos": 465064892928.0, + "grad_norm": 0.12322372516304661, + "language_loss": 0.83831322, + "learning_rate": 0.00045085578821782175, + "loss": 0.84911072, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.1784668, + "step": 2835, + "time_per_iteration": 2.568789482116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021066, + "balance_loss_mlp": 1.01186323, + "epoch": 0.5455944594074644, + "flos": 1469657820672.0, + "grad_norm": 0.019977782676812977, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77155805, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.09179688, + "step": 2836, + "time_per_iteration": 
4.917972803115845 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.05981982, + "epoch": 0.545786841092728, + "flos": 533180371968.0, + "grad_norm": 0.07873848801353439, + "language_loss": 0.809609, + "learning_rate": 0.00045023575891159866, + "loss": 0.82039082, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.18347168, + "step": 2837, + "time_per_iteration": 2.723172187805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005469, + "balance_loss_mlp": 0.99645638, + "epoch": 0.5459792227779915, + "flos": 1352389810176.0, + "grad_norm": 0.008784480510471485, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75769281, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.09033203, + "step": 2838, + "time_per_iteration": 4.9626312255859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108476, + "balance_loss_mlp": 1.06662869, + "epoch": 0.5461716044632551, + "flos": 637881537024.0, + "grad_norm": 0.06459027340027895, + "language_loss": 0.77977401, + "learning_rate": 0.0004496158068861354, + "loss": 0.79062164, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.18139648, + "step": 2839, + "time_per_iteration": 2.8617422580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089506, + "balance_loss_mlp": 1.0716958, + "epoch": 0.5463639861485187, + "flos": 602751352320.0, + "grad_norm": 0.06807598587278012, + "language_loss": 0.8025732, + "learning_rate": 0.00044930586015455207, + "loss": 0.81346834, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.17810059, + "step": 2840, + "time_per_iteration": 2.808669328689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083083, + "balance_loss_mlp": 1.06519008, + "epoch": 0.5465563678337823, + "flos": 642516738048.0, + "grad_norm": 0.07651604144285383, + "language_loss": 0.88620353, + "learning_rate": 
0.000448995933104179, + "loss": 0.89703441, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.17907715, + "step": 2841, + "time_per_iteration": 2.877012252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091203, + "balance_loss_mlp": 1.07347631, + "epoch": 0.5467487495190458, + "flos": 614154161664.0, + "grad_norm": 0.06436857909350054, + "language_loss": 0.79967082, + "learning_rate": 0.00044868602585534077, + "loss": 0.81058288, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.17749023, + "step": 2842, + "time_per_iteration": 2.8602800369262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086176, + "balance_loss_mlp": 1.06872416, + "epoch": 0.5469411312043093, + "flos": 461190661632.0, + "grad_norm": 0.07724706520419639, + "language_loss": 0.88682342, + "learning_rate": 0.0004483761385283541, + "loss": 0.89768517, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.17468262, + "step": 2843, + "time_per_iteration": 2.613612413406372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083541, + "balance_loss_mlp": 1.06613624, + "epoch": 0.5471335128895729, + "flos": 561197154816.0, + "grad_norm": 0.07006219963607276, + "language_loss": 0.81547797, + "learning_rate": 0.0004480662712435281, + "loss": 0.82631338, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.17419434, + "step": 2844, + "time_per_iteration": 2.754683256149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084106, + "balance_loss_mlp": 1.0670594, + "epoch": 0.5473258945748365, + "flos": 518686695936.0, + "grad_norm": 0.0733295738661856, + "language_loss": 0.88330519, + "learning_rate": 0.0004477564241211635, + "loss": 0.89414632, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.1706543, + "step": 2845, + "time_per_iteration": 2.6289172172546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_mlp": 1.06219196, + 
"epoch": 0.5475182762601001, + "flos": 433828763136.0, + "grad_norm": 0.07864053458548881, + "language_loss": 0.8673318, + "learning_rate": 0.0004474465972815541, + "loss": 0.87812233, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.16870117, + "step": 2846, + "time_per_iteration": 2.560227870941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082496, + "balance_loss_mlp": 1.06498456, + "epoch": 0.5477106579453636, + "flos": 511560811008.0, + "grad_norm": 0.07175771823025028, + "language_loss": 0.87547499, + "learning_rate": 0.000447136790844985, + "loss": 0.88629997, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.17529297, + "step": 2847, + "time_per_iteration": 2.677354574203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084037, + "balance_loss_mlp": 1.0662632, + "epoch": 0.5479030396306271, + "flos": 675912439296.0, + "grad_norm": 0.07349860951266184, + "language_loss": 0.80877674, + "learning_rate": 0.00044682700493173385, + "loss": 0.81961715, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.17785645, + "step": 2848, + "time_per_iteration": 2.8295233249664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085345, + "balance_loss_mlp": 1.06835747, + "epoch": 0.5480954213158907, + "flos": 876090981888.0, + "grad_norm": 0.14023883156705388, + "language_loss": 0.80396128, + "learning_rate": 0.00044651723966207004, + "loss": 0.81481469, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.17004395, + "step": 2849, + "time_per_iteration": 3.1462562084198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108891, + "balance_loss_mlp": 1.07174444, + "epoch": 0.5482878030011543, + "flos": 622006511616.0, + "grad_norm": 0.07606363506125788, + "language_loss": 0.78336805, + "learning_rate": 0.00044620749515625536, + "loss": 0.79425722, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.17163086, + "step": 2850, + 
"time_per_iteration": 2.7834317684173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010911, + "balance_loss_mlp": 1.07376719, + "epoch": 0.5484801846864179, + "flos": 497207725056.0, + "grad_norm": 0.06852456667367239, + "language_loss": 0.84954178, + "learning_rate": 0.00044589777153454334, + "loss": 0.86045277, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.17346191, + "step": 2851, + "time_per_iteration": 2.760814666748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093157, + "balance_loss_mlp": 1.076015, + "epoch": 0.5486725663716814, + "flos": 442432171008.0, + "grad_norm": 0.07096393350950583, + "language_loss": 0.83673847, + "learning_rate": 0.00044558806891717895, + "loss": 0.84767002, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.17163086, + "step": 2852, + "time_per_iteration": 2.5164217948913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100792, + "balance_loss_mlp": 1.08369744, + "epoch": 0.548864948056945, + "flos": 655162504704.0, + "grad_norm": 0.07126320694951607, + "language_loss": 0.79487526, + "learning_rate": 0.0004452783874243998, + "loss": 0.80588323, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.17102051, + "step": 2853, + "time_per_iteration": 2.8530960083007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103909, + "balance_loss_mlp": 1.08725584, + "epoch": 0.5490573297422086, + "flos": 546036111360.0, + "grad_norm": 0.08398495342430926, + "language_loss": 0.84832799, + "learning_rate": 0.00044496872717643475, + "loss": 0.85936707, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.16662598, + "step": 2854, + "time_per_iteration": 2.7308356761932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148414, + "balance_loss_mlp": 1.13902032, + "epoch": 0.5492497114274721, + "flos": 1590309987840.0, + "grad_norm": 0.045162076754917825, + "language_loss": 0.77089292, + 
"learning_rate": 0.00044465908829350453, + "loss": 0.78237706, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.09375, + "step": 2855, + "time_per_iteration": 4.96257209777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110869, + "balance_loss_mlp": 1.0924654, + "epoch": 0.5494420931127356, + "flos": 750906754560.0, + "grad_norm": 0.08468526373475738, + "language_loss": 0.81551182, + "learning_rate": 0.0004443494708958217, + "loss": 0.8265987, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.16223145, + "step": 2856, + "time_per_iteration": 3.0704264640808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101313, + "balance_loss_mlp": 1.08494544, + "epoch": 0.5496344747979992, + "flos": 626023904256.0, + "grad_norm": 0.07044544020385766, + "language_loss": 0.8094157, + "learning_rate": 0.0004440398751035906, + "loss": 0.82042885, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.16369629, + "step": 2857, + "time_per_iteration": 2.971601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089993, + "balance_loss_mlp": 1.07342279, + "epoch": 0.5498268564832628, + "flos": 523111924224.0, + "grad_norm": 0.09537197244188163, + "language_loss": 0.83738565, + "learning_rate": 0.00044373030103700645, + "loss": 0.84828568, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.16577148, + "step": 2858, + "time_per_iteration": 2.6193714141845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082176, + "balance_loss_mlp": 1.06564164, + "epoch": 0.5500192381685264, + "flos": 604587151872.0, + "grad_norm": 0.080765091719421, + "language_loss": 0.79399335, + "learning_rate": 0.000443420748816257, + "loss": 0.80481505, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.16540527, + "step": 2859, + "time_per_iteration": 2.8064911365509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080106, + "balance_loss_mlp": 
1.06258249, + "epoch": 0.55021161985379, + "flos": 520527264768.0, + "grad_norm": 0.073148777328263, + "language_loss": 0.78411651, + "learning_rate": 0.0004431112185615208, + "loss": 0.79491758, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.17541504, + "step": 2860, + "time_per_iteration": 2.8055756092071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075442, + "balance_loss_mlp": 1.05794191, + "epoch": 0.5504040015390534, + "flos": 489671806464.0, + "grad_norm": 0.07383159181316334, + "language_loss": 0.80081785, + "learning_rate": 0.00044280171039296845, + "loss": 0.81157225, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.17504883, + "step": 2861, + "time_per_iteration": 2.643036127090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107612, + "balance_loss_mlp": 1.05894184, + "epoch": 0.550596383224317, + "flos": 575787377664.0, + "grad_norm": 0.07661018407476591, + "language_loss": 0.88472402, + "learning_rate": 0.0004424922244307616, + "loss": 0.89548522, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.171875, + "step": 2862, + "time_per_iteration": 2.735457181930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071011, + "balance_loss_mlp": 1.05303383, + "epoch": 0.5507887649095806, + "flos": 642445157376.0, + "grad_norm": 0.07542764443639904, + "language_loss": 0.82038581, + "learning_rate": 0.00044218276079505315, + "loss": 0.83109593, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.17980957, + "step": 2863, + "time_per_iteration": 2.8912277221679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074407, + "balance_loss_mlp": 1.05706251, + "epoch": 0.5509811465948442, + "flos": 531843812352.0, + "grad_norm": 0.07733612279333801, + "language_loss": 0.74451876, + "learning_rate": 0.0004418733196059876, + "loss": 0.75526285, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.17358398, + "step": 2864, + 
"time_per_iteration": 2.7250518798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072289, + "balance_loss_mlp": 1.0549556, + "epoch": 0.5511735282801077, + "flos": 654747328512.0, + "grad_norm": 0.07639087544106095, + "language_loss": 0.79757476, + "learning_rate": 0.0004415639009837008, + "loss": 0.80829769, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.17358398, + "step": 2865, + "time_per_iteration": 2.864443302154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080254, + "balance_loss_mlp": 1.06293249, + "epoch": 0.5513659099653713, + "flos": 529498861056.0, + "grad_norm": 0.10225669356006223, + "language_loss": 0.81241995, + "learning_rate": 0.00044125450504831955, + "loss": 0.82322252, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.17346191, + "step": 2866, + "time_per_iteration": 2.757418394088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106921, + "balance_loss_mlp": 1.05211556, + "epoch": 0.5515582916506349, + "flos": 554869315584.0, + "grad_norm": 0.07466053084799135, + "language_loss": 0.82329029, + "learning_rate": 0.0004409451319199622, + "loss": 0.83398235, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.17102051, + "step": 2867, + "time_per_iteration": 2.6991469860076904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076691, + "balance_loss_mlp": 1.05928612, + "epoch": 0.5517506733358984, + "flos": 735407258112.0, + "grad_norm": 0.07186936074556817, + "language_loss": 0.84288383, + "learning_rate": 0.0004406357817187381, + "loss": 0.85365069, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.17419434, + "step": 2868, + "time_per_iteration": 3.0115489959716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080563, + "balance_loss_mlp": 1.06333685, + "epoch": 0.551943055021162, + "flos": 1115325697536.0, + "grad_norm": 0.0781084398751081, + "language_loss": 0.81316972, + 
"learning_rate": 0.0004403264545647474, + "loss": 0.82397532, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.17224121, + "step": 2869, + "time_per_iteration": 3.5515377521514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076801, + "balance_loss_mlp": 1.05957544, + "epoch": 0.5521354367064255, + "flos": 544373208576.0, + "grad_norm": 0.2476039521732135, + "language_loss": 0.84535432, + "learning_rate": 0.00044001715057808154, + "loss": 0.85612237, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.17236328, + "step": 2870, + "time_per_iteration": 2.784949541091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081188, + "balance_loss_mlp": 1.06391478, + "epoch": 0.5523278183916891, + "flos": 936285101568.0, + "grad_norm": 0.06269874774360217, + "language_loss": 0.81665605, + "learning_rate": 0.0004397078698788232, + "loss": 0.82746798, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.17285156, + "step": 2871, + "time_per_iteration": 3.2355031967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_mlp": 1.02401352, + "epoch": 0.5525202000769527, + "flos": 1465911696384.0, + "grad_norm": 0.01828848292268018, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81475484, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.09130859, + "step": 2872, + "time_per_iteration": 4.935345888137817 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102138, + "balance_loss_mlp": 1.08499527, + "epoch": 0.5527125817622163, + "flos": 489800286720.0, + "grad_norm": 0.07166089349392388, + "language_loss": 0.77967119, + "learning_rate": 0.00043908937882281343, + "loss": 0.79069257, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.17150879, + "step": 2873, + "time_per_iteration": 2.6478757858276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109525, + 
"balance_loss_mlp": 1.0917629, + "epoch": 0.5529049634474797, + "flos": 634914008064.0, + "grad_norm": 0.0876696984943119, + "language_loss": 0.8235116, + "learning_rate": 0.0004387801687061814, + "loss": 0.83460689, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.17773438, + "step": 2874, + "time_per_iteration": 2.8796098232269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117288, + "balance_loss_mlp": 1.09996676, + "epoch": 0.5530973451327433, + "flos": 581274952704.0, + "grad_norm": 0.10934470386726207, + "language_loss": 0.80325609, + "learning_rate": 0.0004384709823571958, + "loss": 0.81442899, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.17321777, + "step": 2875, + "time_per_iteration": 2.7749507427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116347, + "balance_loss_mlp": 1.09927666, + "epoch": 0.5532897268180069, + "flos": 1122488658432.0, + "grad_norm": 0.09489557943610515, + "language_loss": 0.82828677, + "learning_rate": 0.0004381618198958932, + "loss": 0.83945024, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.17089844, + "step": 2876, + "time_per_iteration": 3.550828218460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113394, + "balance_loss_mlp": 1.09662116, + "epoch": 0.5534821085032705, + "flos": 637273640448.0, + "grad_norm": 0.0896519056563172, + "language_loss": 0.83453453, + "learning_rate": 0.00043785268144230137, + "loss": 0.84566844, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.16784668, + "step": 2877, + "time_per_iteration": 2.934293270111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100462, + "balance_loss_mlp": 1.08360553, + "epoch": 0.5536744901885341, + "flos": 571112529408.0, + "grad_norm": 0.09194081720705921, + "language_loss": 0.8212803, + "learning_rate": 0.00043754356711643837, + "loss": 0.83228499, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 
0.16870117, + "step": 2878, + "time_per_iteration": 2.7139456272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100534, + "balance_loss_mlp": 1.08367825, + "epoch": 0.5538668718737976, + "flos": 595716871680.0, + "grad_norm": 0.06610172637947556, + "language_loss": 0.83962673, + "learning_rate": 0.0004372344770383132, + "loss": 0.85063207, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.16870117, + "step": 2879, + "time_per_iteration": 2.848620891571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093635, + "balance_loss_mlp": 1.07679105, + "epoch": 0.5540592535590612, + "flos": 532602210816.0, + "grad_norm": 0.058036155609321634, + "language_loss": 0.82615423, + "learning_rate": 0.00043692541132792507, + "loss": 0.83709061, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.1685791, + "step": 2880, + "time_per_iteration": 2.713151693344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091805, + "balance_loss_mlp": 1.07453132, + "epoch": 0.5542516352443247, + "flos": 412619235840.0, + "grad_norm": 0.07516039196528058, + "language_loss": 0.83473843, + "learning_rate": 0.00043661637010526384, + "loss": 0.84565651, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.17285156, + "step": 2881, + "time_per_iteration": 2.500458240509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109005, + "balance_loss_mlp": 1.07309878, + "epoch": 0.5544440169295883, + "flos": 547607609856.0, + "grad_norm": 0.06896643795770978, + "language_loss": 0.83134168, + "learning_rate": 0.00043630735349031025, + "loss": 0.84224218, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.16967773, + "step": 2882, + "time_per_iteration": 2.7521300315856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089845, + "balance_loss_mlp": 1.07317972, + "epoch": 0.5546363986148518, + "flos": 621821131776.0, + "grad_norm": 0.0736705000466592, + 
"language_loss": 0.81719375, + "learning_rate": 0.00043599836160303495, + "loss": 0.82809222, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.16674805, + "step": 2883, + "time_per_iteration": 2.927696704864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092625, + "balance_loss_mlp": 1.07550669, + "epoch": 0.5548287803001154, + "flos": 705292945920.0, + "grad_norm": 0.07830589066561539, + "language_loss": 0.77380168, + "learning_rate": 0.0004356893945633995, + "loss": 0.78472787, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.17126465, + "step": 2884, + "time_per_iteration": 2.9854161739349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095707, + "balance_loss_mlp": 1.07886314, + "epoch": 0.555021161985379, + "flos": 504197789184.0, + "grad_norm": 0.06846026312584631, + "language_loss": 0.81705189, + "learning_rate": 0.0004353804524913551, + "loss": 0.82800889, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.1685791, + "step": 2885, + "time_per_iteration": 2.6230812072753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109452, + "balance_loss_mlp": 1.07769918, + "epoch": 0.5552135436706426, + "flos": 616066684416.0, + "grad_norm": 0.07648898628472602, + "language_loss": 0.81513786, + "learning_rate": 0.0004350715355068441, + "loss": 0.82608306, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.16821289, + "step": 2886, + "time_per_iteration": 2.7672505378723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088661, + "balance_loss_mlp": 1.07191217, + "epoch": 0.5554059253559062, + "flos": 463871494656.0, + "grad_norm": 0.09976401172783889, + "language_loss": 0.79409927, + "learning_rate": 0.00043476264372979847, + "loss": 0.80498588, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.16760254, + "step": 2887, + "time_per_iteration": 2.5482900142669678 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0108678, + "balance_loss_mlp": 1.07004309, + "epoch": 0.5555983070411696, + "flos": 1562512384512.0, + "grad_norm": 0.07823105816490118, + "language_loss": 0.78681719, + "learning_rate": 0.0004344537772801408, + "loss": 0.79768503, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.16748047, + "step": 2888, + "time_per_iteration": 3.8460328578948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021438, + "balance_loss_mlp": 1.01290298, + "epoch": 0.5557906887264332, + "flos": 1467917821440.0, + "grad_norm": 0.01755933384686064, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74443889, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.08544922, + "step": 2889, + "time_per_iteration": 4.991191625595093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090362, + "balance_loss_mlp": 1.07311237, + "epoch": 0.5559830704116968, + "flos": 529832544768.0, + "grad_norm": 0.07150457401269486, + "language_loss": 0.83297288, + "learning_rate": 0.0004338361208426298, + "loss": 0.84387648, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.17272949, + "step": 2890, + "time_per_iteration": 2.6730411052703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108659, + "balance_loss_mlp": 1.06942344, + "epoch": 0.5561754520969604, + "flos": 651218890752.0, + "grad_norm": 0.07268648775014128, + "language_loss": 0.81282032, + "learning_rate": 0.00043352733109457164, + "loss": 0.82368624, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.17175293, + "step": 2891, + "time_per_iteration": 2.9306631088256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094106, + "balance_loss_mlp": 1.07713079, + "epoch": 0.556367833782224, + "flos": 734297923584.0, + "grad_norm": 0.057117910972540105, + "language_loss": 0.8439607, + "learning_rate": 0.00043321856715349244, + "loss": 0.85490179, + "num_input_tokens_seen": 
241706272, + "router_z_loss_mlp": 0.1697998, + "step": 2892, + "time_per_iteration": 2.9671812057495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089069, + "balance_loss_mlp": 1.07197452, + "epoch": 0.5565602154674875, + "flos": 672423648768.0, + "grad_norm": 0.07676329529256688, + "language_loss": 0.80519265, + "learning_rate": 0.00043290982913926466, + "loss": 0.81608331, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.17089844, + "step": 2893, + "time_per_iteration": 2.853346347808838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095237, + "balance_loss_mlp": 1.07807112, + "epoch": 0.556752597152751, + "flos": 586228783104.0, + "grad_norm": 0.07854184605893377, + "language_loss": 0.84350514, + "learning_rate": 0.0004326011171717514, + "loss": 0.8544575, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.171875, + "step": 2894, + "time_per_iteration": 2.899630546569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090371, + "balance_loss_mlp": 1.07324028, + "epoch": 0.5569449788380146, + "flos": 437777146368.0, + "grad_norm": 0.0742839839754536, + "language_loss": 0.80647063, + "learning_rate": 0.0004322924313708051, + "loss": 0.81737435, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.17138672, + "step": 2895, + "time_per_iteration": 2.51411771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094314, + "balance_loss_mlp": 1.07758927, + "epoch": 0.5571373605232782, + "flos": 502250761728.0, + "grad_norm": 0.09937187753239417, + "language_loss": 0.8452369, + "learning_rate": 0.0004319837718562681, + "loss": 0.85618007, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.1673584, + "step": 2896, + "time_per_iteration": 2.655710220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079176, + "balance_loss_mlp": 1.06149721, + "epoch": 0.5573297422085417, + "flos": 577417973760.0, + "grad_norm": 
0.08562998531843592, + "language_loss": 0.83042324, + "learning_rate": 0.0004316751387479726, + "loss": 0.84121501, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.17700195, + "step": 2897, + "time_per_iteration": 2.7913060188293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087861, + "balance_loss_mlp": 1.07069528, + "epoch": 0.5575221238938053, + "flos": 1344037515264.0, + "grad_norm": 0.0783746969742657, + "language_loss": 0.82070696, + "learning_rate": 0.0004313665321657409, + "loss": 0.83158553, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.17175293, + "step": 2898, + "time_per_iteration": 3.726264476776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086135, + "balance_loss_mlp": 1.06881404, + "epoch": 0.5577145055790689, + "flos": 601963218432.0, + "grad_norm": 0.0851867501114316, + "language_loss": 0.79751718, + "learning_rate": 0.00043105795222938436, + "loss": 0.80837852, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.17346191, + "step": 2899, + "time_per_iteration": 2.7452197074890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079222, + "balance_loss_mlp": 1.06218684, + "epoch": 0.5579068872643325, + "flos": 562620349440.0, + "grad_norm": 0.07553101492130006, + "language_loss": 0.78055334, + "learning_rate": 0.00043074939905870467, + "loss": 0.7913456, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.17053223, + "step": 2900, + "time_per_iteration": 2.6780247688293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107635, + "balance_loss_mlp": 1.05935049, + "epoch": 0.558099268949596, + "flos": 544551247872.0, + "grad_norm": 0.07503151839740589, + "language_loss": 0.80663788, + "learning_rate": 0.0004304408727734927, + "loss": 0.81740135, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.17016602, + "step": 2901, + "time_per_iteration": 2.7029857635498047 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01073519, + "balance_loss_mlp": 1.05609071, + "epoch": 0.5582916506348595, + "flos": 552786467328.0, + "grad_norm": 0.07321045917693372, + "language_loss": 0.88611877, + "learning_rate": 0.0004301323734935288, + "loss": 0.89685392, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.17443848, + "step": 2902, + "time_per_iteration": 2.679443597793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107071, + "balance_loss_mlp": 1.05356789, + "epoch": 0.5584840323201231, + "flos": 543385013760.0, + "grad_norm": 0.07694594545228804, + "language_loss": 0.8710258, + "learning_rate": 0.000429823901338583, + "loss": 0.88173282, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.17150879, + "step": 2903, + "time_per_iteration": 2.627321720123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069288, + "balance_loss_mlp": 1.05181181, + "epoch": 0.5586764140053867, + "flos": 815573090304.0, + "grad_norm": 0.06625834371738154, + "language_loss": 0.8649714, + "learning_rate": 0.00042951545642841513, + "loss": 0.87566429, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.17492676, + "step": 2904, + "time_per_iteration": 3.0950725078582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079393, + "balance_loss_mlp": 1.06204844, + "epoch": 0.5588687956906503, + "flos": 486439976448.0, + "grad_norm": 0.06552893866180562, + "language_loss": 0.86677754, + "learning_rate": 0.0004292070388827737, + "loss": 0.87757146, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.17358398, + "step": 2905, + "time_per_iteration": 2.6045844554901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079778, + "balance_loss_mlp": 1.0621829, + "epoch": 0.5590611773759138, + "flos": 452060849664.0, + "grad_norm": 0.06250610211350227, + "language_loss": 0.81015515, + "learning_rate": 0.00042889864882139753, + "loss": 0.82095295, + "num_input_tokens_seen": 
242737456, + "router_z_loss_mlp": 0.17602539, + "step": 2906, + "time_per_iteration": 2.5961766242980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089486, + "balance_loss_mlp": 1.07233191, + "epoch": 0.5592535590611774, + "flos": 520945012224.0, + "grad_norm": 0.06934465100856418, + "language_loss": 0.81378168, + "learning_rate": 0.0004285902863640139, + "loss": 0.82467651, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.17175293, + "step": 2907, + "time_per_iteration": 2.6232824325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085922, + "balance_loss_mlp": 1.06869626, + "epoch": 0.5594459407464409, + "flos": 552519595008.0, + "grad_norm": 0.10268967312822828, + "language_loss": 0.86113304, + "learning_rate": 0.00042828195163033966, + "loss": 0.87199223, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.17236328, + "step": 2908, + "time_per_iteration": 2.696558952331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099626, + "balance_loss_mlp": 1.08187604, + "epoch": 0.5596383224317045, + "flos": 484833973248.0, + "grad_norm": 0.07292872799420033, + "language_loss": 0.78787363, + "learning_rate": 0.0004279736447400812, + "loss": 0.79886991, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.1776123, + "step": 2909, + "time_per_iteration": 2.5506749153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097418, + "balance_loss_mlp": 1.08000195, + "epoch": 0.5598307041169681, + "flos": 611256015360.0, + "grad_norm": 0.08183440800263254, + "language_loss": 0.78410208, + "learning_rate": 0.00042766536581293385, + "loss": 0.79507631, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.17431641, + "step": 2910, + "time_per_iteration": 2.762291193008423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107558, + "balance_loss_mlp": 1.09001017, + "epoch": 0.5600230858022316, + "flos": 488851365888.0, + 
"grad_norm": 0.07156517368649688, + "language_loss": 0.79594785, + "learning_rate": 0.0004273571149685819, + "loss": 0.80702341, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.17541504, + "step": 2911, + "time_per_iteration": 2.8065130710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106568, + "balance_loss_mlp": 1.08937764, + "epoch": 0.5602154674874952, + "flos": 598869780480.0, + "grad_norm": 0.09303022295818829, + "language_loss": 0.83760977, + "learning_rate": 0.00042704889232669937, + "loss": 0.84867543, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.17199707, + "step": 2912, + "time_per_iteration": 2.7454051971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107264, + "balance_loss_mlp": 1.09049106, + "epoch": 0.5604078491727588, + "flos": 585969624576.0, + "grad_norm": 0.08686899917243208, + "language_loss": 0.85566956, + "learning_rate": 0.0004267406980069484, + "loss": 0.86674225, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.16772461, + "step": 2913, + "time_per_iteration": 2.703652858734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100261, + "balance_loss_mlp": 1.08297539, + "epoch": 0.5606002308580224, + "flos": 541205618688.0, + "grad_norm": 0.07169329099349257, + "language_loss": 0.79587048, + "learning_rate": 0.0004264325321289808, + "loss": 0.80687308, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.17297363, + "step": 2914, + "time_per_iteration": 2.8367066383361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100801, + "balance_loss_mlp": 1.08408761, + "epoch": 0.5607926125432858, + "flos": 583938533376.0, + "grad_norm": 0.08752271404037346, + "language_loss": 0.85925829, + "learning_rate": 0.00042612439481243736, + "loss": 0.87026626, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.16711426, + "step": 2915, + "time_per_iteration": 2.801067590713501 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102823, + "balance_loss_mlp": 1.08577609, + "epoch": 0.5609849942285494, + "flos": 627489317376.0, + "grad_norm": 0.08075626027224062, + "language_loss": 0.89818108, + "learning_rate": 0.00042581628617694735, + "loss": 0.90920925, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.1706543, + "step": 2916, + "time_per_iteration": 2.75644588470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101283, + "balance_loss_mlp": 1.08478427, + "epoch": 0.561177375913813, + "flos": 588366332928.0, + "grad_norm": 0.09688272488525364, + "language_loss": 0.82010305, + "learning_rate": 0.0004255082063421296, + "loss": 0.83111584, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.16503906, + "step": 2917, + "time_per_iteration": 2.7048747539520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101411, + "balance_loss_mlp": 1.08411336, + "epoch": 0.5613697575990766, + "flos": 527047824384.0, + "grad_norm": 0.05911652799286667, + "language_loss": 0.84559923, + "learning_rate": 0.00042520015542759065, + "loss": 0.8566134, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.17297363, + "step": 2918, + "time_per_iteration": 2.8888731002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096781, + "balance_loss_mlp": 1.0798173, + "epoch": 0.5615621392843402, + "flos": 642655130112.0, + "grad_norm": 0.0855416495861322, + "language_loss": 0.87984401, + "learning_rate": 0.00042489213355292687, + "loss": 0.8908118, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.1697998, + "step": 2919, + "time_per_iteration": 2.9039535522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099415, + "balance_loss_mlp": 1.08183169, + "epoch": 0.5617545209696037, + "flos": 427750543872.0, + "grad_norm": 0.09901142655299539, + "language_loss": 0.80785292, + "learning_rate": 0.00042458414083772276, + "loss": 0.81884712, + 
"num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.17590332, + "step": 2920, + "time_per_iteration": 2.55914306640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100735, + "balance_loss_mlp": 1.08350968, + "epoch": 0.5619469026548672, + "flos": 568429125120.0, + "grad_norm": 0.058059763768477664, + "language_loss": 0.84851801, + "learning_rate": 0.000424276177401552, + "loss": 0.85952532, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.17248535, + "step": 2921, + "time_per_iteration": 2.847381353378296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090657, + "balance_loss_mlp": 1.07289529, + "epoch": 0.5621392843401308, + "flos": 505205807616.0, + "grad_norm": 0.08698061874066902, + "language_loss": 0.85584521, + "learning_rate": 0.0004239682433639763, + "loss": 0.86675179, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.17785645, + "step": 2922, + "time_per_iteration": 2.707058906555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095936, + "balance_loss_mlp": 1.07888877, + "epoch": 0.5623316660253944, + "flos": 516996628992.0, + "grad_norm": 0.07977820706870507, + "language_loss": 0.85277724, + "learning_rate": 0.0004236603388445467, + "loss": 0.86373651, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.1706543, + "step": 2923, + "time_per_iteration": 2.6301956176757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090666, + "balance_loss_mlp": 1.07373846, + "epoch": 0.5625240477106579, + "flos": 606012917760.0, + "grad_norm": 0.07720818022124956, + "language_loss": 0.81903416, + "learning_rate": 0.00042335246396280166, + "loss": 0.8299408, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.16943359, + "step": 2924, + "time_per_iteration": 2.834073066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090909, + "balance_loss_mlp": 1.07374263, + "epoch": 0.5627164293959215, + "flos": 
450430253568.0, + "grad_norm": 0.07626854299399176, + "language_loss": 0.9026264, + "learning_rate": 0.0004230446188382693, + "loss": 0.91353548, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.171875, + "step": 2925, + "time_per_iteration": 2.6027684211730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092312, + "balance_loss_mlp": 1.07481217, + "epoch": 0.5629088110811851, + "flos": 742073550336.0, + "grad_norm": 0.06785040334520868, + "language_loss": 0.80436468, + "learning_rate": 0.0004227368035904654, + "loss": 0.81528783, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.17504883, + "step": 2926, + "time_per_iteration": 3.005417585372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097122, + "balance_loss_mlp": 1.0790019, + "epoch": 0.5631011927664487, + "flos": 496970588160.0, + "grad_norm": 0.06983498391207757, + "language_loss": 0.82735908, + "learning_rate": 0.00042242901833889474, + "loss": 0.83833027, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.18139648, + "step": 2927, + "time_per_iteration": 2.6397151947021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090818, + "balance_loss_mlp": 1.07340133, + "epoch": 0.5632935744517122, + "flos": 886137408000.0, + "grad_norm": 0.08127979757153865, + "language_loss": 0.85876542, + "learning_rate": 0.0004221212632030501, + "loss": 0.86967361, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.17443848, + "step": 2928, + "time_per_iteration": 3.098761558532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098859, + "balance_loss_mlp": 1.08115637, + "epoch": 0.5634859561369757, + "flos": 604792355328.0, + "grad_norm": 0.07359943981906872, + "language_loss": 0.80209559, + "learning_rate": 0.0004218135383024124, + "loss": 0.81308413, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.17724609, + "step": 2929, + "time_per_iteration": 2.7450544834136963 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087151, + "balance_loss_mlp": 1.06923413, + "epoch": 0.5636783378222393, + "flos": 453916472832.0, + "grad_norm": 0.08357226339131614, + "language_loss": 0.85142308, + "learning_rate": 0.0004215058437564511, + "loss": 0.86229455, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.17919922, + "step": 2930, + "time_per_iteration": 2.592543125152588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083944, + "balance_loss_mlp": 1.06644368, + "epoch": 0.5638707195075029, + "flos": 518456899584.0, + "grad_norm": 0.14879002546575693, + "language_loss": 0.82019955, + "learning_rate": 0.00042119817968462397, + "loss": 0.83103901, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.17504883, + "step": 2931, + "time_per_iteration": 2.645047187805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080791, + "balance_loss_mlp": 1.06259942, + "epoch": 0.5640631011927665, + "flos": 564873896448.0, + "grad_norm": 0.08065967807891394, + "language_loss": 0.86642003, + "learning_rate": 0.0004208905462063766, + "loss": 0.87722796, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.18200684, + "step": 2932, + "time_per_iteration": 2.6538538932800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108164, + "balance_loss_mlp": 1.06381869, + "epoch": 0.56425548287803, + "flos": 517033704960.0, + "grad_norm": 0.07678540437917139, + "language_loss": 0.84284365, + "learning_rate": 0.00042058294344114315, + "loss": 0.85366011, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.17834473, + "step": 2933, + "time_per_iteration": 2.658790349960327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088075, + "balance_loss_mlp": 1.07069397, + "epoch": 0.5644478645632935, + "flos": 854258876928.0, + "grad_norm": 0.06842628935517767, + "language_loss": 0.77464747, + "learning_rate": 0.0004202753715083456, + "loss": 0.78552824, 
+ "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.1739502, + "step": 2934, + "time_per_iteration": 3.0965383052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084539, + "balance_loss_mlp": 1.06742072, + "epoch": 0.5646402462485571, + "flos": 553438780416.0, + "grad_norm": 0.07525134320826762, + "language_loss": 0.80874884, + "learning_rate": 0.0004199678305273936, + "loss": 0.81959426, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.17126465, + "step": 2935, + "time_per_iteration": 2.6553165912628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097701, + "balance_loss_mlp": 1.08022487, + "epoch": 0.5648326279338207, + "flos": 685990798848.0, + "grad_norm": 0.06441901520709055, + "language_loss": 0.81395012, + "learning_rate": 0.0004196603206176854, + "loss": 0.82492715, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.17492676, + "step": 2936, + "time_per_iteration": 2.983830213546753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087212, + "balance_loss_mlp": 1.07004595, + "epoch": 0.5650250096190843, + "flos": 803327818752.0, + "grad_norm": 0.07452375479830534, + "language_loss": 0.83586991, + "learning_rate": 0.000419352841898607, + "loss": 0.84674203, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.171875, + "step": 2937, + "time_per_iteration": 3.003563404083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089681, + "balance_loss_mlp": 1.07318234, + "epoch": 0.5652173913043478, + "flos": 582058317312.0, + "grad_norm": 0.07366437466259683, + "language_loss": 0.76944578, + "learning_rate": 0.000419045394489532, + "loss": 0.78034258, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.16503906, + "step": 2938, + "time_per_iteration": 2.6973941326141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089785, + "balance_loss_mlp": 1.07220173, + "epoch": 0.5654097729896114, + "flos": 
820648060416.0, + "grad_norm": 0.09626894788078913, + "language_loss": 0.76665318, + "learning_rate": 0.0004187379785098224, + "loss": 0.77755105, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.17602539, + "step": 2939, + "time_per_iteration": 3.165407657623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089382, + "balance_loss_mlp": 1.07268023, + "epoch": 0.565602154674875, + "flos": 784156723200.0, + "grad_norm": 0.07214080103004945, + "language_loss": 0.83462155, + "learning_rate": 0.00041843059407882744, + "loss": 0.84551537, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.16711426, + "step": 2940, + "time_per_iteration": 2.9633572101593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086882, + "balance_loss_mlp": 1.06998992, + "epoch": 0.5657945363601385, + "flos": 549683117568.0, + "grad_norm": 0.07122107277750783, + "language_loss": 0.8230179, + "learning_rate": 0.0004181232413158842, + "loss": 0.83388674, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.16906738, + "step": 2941, + "time_per_iteration": 2.6848304271698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091116, + "balance_loss_mlp": 1.07422447, + "epoch": 0.5659869180454021, + "flos": 668126900736.0, + "grad_norm": 0.08263268782748946, + "language_loss": 0.82281923, + "learning_rate": 0.0004178159203403179, + "loss": 0.83373046, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.16906738, + "step": 2942, + "time_per_iteration": 2.84724760055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090202, + "balance_loss_mlp": 1.07366729, + "epoch": 0.5661792997306656, + "flos": 499955369472.0, + "grad_norm": 0.06696308597668005, + "language_loss": 0.81382257, + "learning_rate": 0.0004175086312714409, + "loss": 0.82472456, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.16540527, + "step": 2943, + "time_per_iteration": 2.582885265350342 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092361, + "balance_loss_mlp": 1.0759573, + "epoch": 0.5663716814159292, + "flos": 601209589248.0, + "grad_norm": 0.060450118167724956, + "language_loss": 0.83769757, + "learning_rate": 0.00041720137422855366, + "loss": 0.84862119, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.1640625, + "step": 2944, + "time_per_iteration": 2.771480083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095642, + "balance_loss_mlp": 1.0798583, + "epoch": 0.5665640631011928, + "flos": 540988305408.0, + "grad_norm": 0.26231884968371866, + "language_loss": 0.7874673, + "learning_rate": 0.00041689414933094383, + "loss": 0.79842371, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.15771484, + "step": 2945, + "time_per_iteration": 2.6965370178222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096629, + "balance_loss_mlp": 1.08027291, + "epoch": 0.5667564447864564, + "flos": 601936054272.0, + "grad_norm": 0.08450400231002299, + "language_loss": 0.81155264, + "learning_rate": 0.00041658695669788653, + "loss": 0.82251894, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.16357422, + "step": 2946, + "time_per_iteration": 2.727442741394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105563, + "balance_loss_mlp": 1.08905292, + "epoch": 0.5669488264717198, + "flos": 659523492864.0, + "grad_norm": 0.08705150140664149, + "language_loss": 0.81145883, + "learning_rate": 0.00041627979644864453, + "loss": 0.82251441, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.16516113, + "step": 2947, + "time_per_iteration": 2.8466544151306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112751, + "balance_loss_mlp": 1.0964433, + "epoch": 0.5671412081569834, + "flos": 485402222592.0, + "grad_norm": 0.062214847979028806, + "language_loss": 0.8092283, + "learning_rate": 0.0004159726687024683, + "loss": 0.82035577, 
+ "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.16308594, + "step": 2948, + "time_per_iteration": 2.649352788925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118496, + "balance_loss_mlp": 1.10242701, + "epoch": 0.567333589842247, + "flos": 729801114624.0, + "grad_norm": 0.09810621328318807, + "language_loss": 0.79565436, + "learning_rate": 0.00041566557357859506, + "loss": 0.80683935, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.16064453, + "step": 2949, + "time_per_iteration": 2.9100704193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128592, + "balance_loss_mlp": 1.11225998, + "epoch": 0.5675259715275106, + "flos": 968887526400.0, + "grad_norm": 0.08040833195953295, + "language_loss": 0.79227537, + "learning_rate": 0.0004153585111962502, + "loss": 0.80356133, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.16333008, + "step": 2950, + "time_per_iteration": 3.332738161087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135277, + "balance_loss_mlp": 1.11884952, + "epoch": 0.5677183532127742, + "flos": 565145538048.0, + "grad_norm": 0.06937214621935889, + "language_loss": 0.84358597, + "learning_rate": 0.0004150514816746453, + "loss": 0.85493875, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.16418457, + "step": 2951, + "time_per_iteration": 2.712589979171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138042, + "balance_loss_mlp": 1.12165022, + "epoch": 0.5679107348980377, + "flos": 551694385152.0, + "grad_norm": 0.07032847030676616, + "language_loss": 0.85400414, + "learning_rate": 0.0004147444851329802, + "loss": 0.86538458, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.16394043, + "step": 2952, + "time_per_iteration": 2.6828949451446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147692, + "balance_loss_mlp": 1.13107419, + "epoch": 0.5681031165833013, + "flos": 
819459804672.0, + "grad_norm": 0.07370144055460691, + "language_loss": 0.85637259, + "learning_rate": 0.00041443752169044126, + "loss": 0.86784947, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.16625977, + "step": 2953, + "time_per_iteration": 3.0499908924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156702, + "balance_loss_mlp": 1.13983333, + "epoch": 0.5682954982685648, + "flos": 618013711872.0, + "grad_norm": 0.07840541898783242, + "language_loss": 0.84904528, + "learning_rate": 0.0004141305914662025, + "loss": 0.86061233, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.16882324, + "step": 2954, + "time_per_iteration": 2.732133626937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135414, + "balance_loss_mlp": 1.1186291, + "epoch": 0.5684878799538284, + "flos": 647949984768.0, + "grad_norm": 0.0690175597343332, + "language_loss": 0.80056989, + "learning_rate": 0.0004138236945794246, + "loss": 0.81192404, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.16784668, + "step": 2955, + "time_per_iteration": 2.920898914337158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127256, + "balance_loss_mlp": 1.1108526, + "epoch": 0.5686802616390919, + "flos": 805961664000.0, + "grad_norm": 0.09346989124624208, + "language_loss": 0.83651698, + "learning_rate": 0.00041351683114925576, + "loss": 0.84778959, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.1640625, + "step": 2956, + "time_per_iteration": 3.1179428100585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122658, + "balance_loss_mlp": 1.10612392, + "epoch": 0.5688726433243555, + "flos": 547140676608.0, + "grad_norm": 0.07393250127791023, + "language_loss": 0.86702883, + "learning_rate": 0.0004132100012948308, + "loss": 0.87825537, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.16540527, + "step": 2957, + "time_per_iteration": 2.6336829662323 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127835, + "balance_loss_mlp": 1.11014426, + "epoch": 0.5690650250096191, + "flos": 486568456704.0, + "grad_norm": 0.08317259373738083, + "language_loss": 0.84444946, + "learning_rate": 0.00041290320513527145, + "loss": 0.85572779, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.17712402, + "step": 2958, + "time_per_iteration": 2.641665458679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123865, + "balance_loss_mlp": 1.10708022, + "epoch": 0.5692574066948827, + "flos": 577457620992.0, + "grad_norm": 0.07155108401540258, + "language_loss": 0.8494001, + "learning_rate": 0.0004125964427896867, + "loss": 0.86063874, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.16796875, + "step": 2959, + "time_per_iteration": 2.6707890033721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111469, + "balance_loss_mlp": 1.09486318, + "epoch": 0.5694497883801463, + "flos": 454247585280.0, + "grad_norm": 0.06610188466362152, + "language_loss": 0.79023135, + "learning_rate": 0.0004122897143771723, + "loss": 0.80134606, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.1661377, + "step": 2960, + "time_per_iteration": 2.564518690109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113363, + "balance_loss_mlp": 1.09644711, + "epoch": 0.5696421700654097, + "flos": 559516999680.0, + "grad_norm": 0.06798711275929166, + "language_loss": 0.81482321, + "learning_rate": 0.0004119830200168109, + "loss": 0.82595682, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.16931152, + "step": 2961, + "time_per_iteration": 2.6972579956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119725, + "balance_loss_mlp": 1.10334563, + "epoch": 0.5698345517506733, + "flos": 465551649792.0, + "grad_norm": 0.08529196588510703, + "language_loss": 0.88292432, + "learning_rate": 0.0004116763598276714, + "loss": 0.89412153, 
+ "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.16381836, + "step": 2962, + "time_per_iteration": 2.5670664310455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110605, + "balance_loss_mlp": 1.09353447, + "epoch": 0.5700269334359369, + "flos": 605953446912.0, + "grad_norm": 0.06258641476293567, + "language_loss": 0.80866015, + "learning_rate": 0.00041136973392881017, + "loss": 0.81976616, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.17077637, + "step": 2963, + "time_per_iteration": 2.883714437484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106776, + "balance_loss_mlp": 1.08975244, + "epoch": 0.5702193151212005, + "flos": 562709182464.0, + "grad_norm": 0.07231503990514958, + "language_loss": 0.81792593, + "learning_rate": 0.00041106314243926983, + "loss": 0.82899374, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.17041016, + "step": 2964, + "time_per_iteration": 2.7783985137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105614, + "balance_loss_mlp": 1.08862686, + "epoch": 0.570411696806464, + "flos": 523247745024.0, + "grad_norm": 0.0703519634607743, + "language_loss": 0.87298268, + "learning_rate": 0.0004107565854780798, + "loss": 0.88403881, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.17004395, + "step": 2965, + "time_per_iteration": 2.6647095680236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105743, + "balance_loss_mlp": 1.08862448, + "epoch": 0.5706040784917276, + "flos": 718222837248.0, + "grad_norm": 0.10409226913166654, + "language_loss": 0.81182659, + "learning_rate": 0.000410450063164256, + "loss": 0.82288408, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.17126465, + "step": 2966, + "time_per_iteration": 2.866602659225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104134, + "balance_loss_mlp": 1.08703911, + "epoch": 0.5707964601769911, + "flos": 
476707410432.0, + "grad_norm": 0.07688057786324835, + "language_loss": 0.82004988, + "learning_rate": 0.00041014357561680115, + "loss": 0.83109128, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.17114258, + "step": 2967, + "time_per_iteration": 2.5523133277893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109926, + "balance_loss_mlp": 1.09312987, + "epoch": 0.5709888418622547, + "flos": 580101378048.0, + "grad_norm": 0.0904159605578498, + "language_loss": 0.86166346, + "learning_rate": 0.0004098371229547039, + "loss": 0.87276274, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.16809082, + "step": 2968, + "time_per_iteration": 2.724207878112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_mlp": 1.022156, + "epoch": 0.5711812235475183, + "flos": 1579922910720.0, + "grad_norm": 0.013041633212772678, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81042308, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.09326172, + "step": 2969, + "time_per_iteration": 4.806856155395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113794, + "balance_loss_mlp": 1.09678328, + "epoch": 0.5713736052327818, + "flos": 468506695680.0, + "grad_norm": 0.07993701822539574, + "language_loss": 0.80239302, + "learning_rate": 0.00040922432276247107, + "loss": 0.81353092, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.17028809, + "step": 2970, + "time_per_iteration": 2.603079319000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119244, + "balance_loss_mlp": 1.1021136, + "epoch": 0.5715659869180454, + "flos": 537662499840.0, + "grad_norm": 0.07050688201783964, + "language_loss": 0.84539342, + "learning_rate": 0.0004089179754702457, + "loss": 0.85658586, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.17150879, + "step": 2971, + "time_per_iteration": 2.806685209274292 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125569, + "balance_loss_mlp": 1.10841513, + "epoch": 0.571758368603309, + "flos": 656071778304.0, + "grad_norm": 0.1127525051095751, + "language_loss": 0.79654694, + "learning_rate": 0.00040861166353919843, + "loss": 0.80780256, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.17175293, + "step": 2972, + "time_per_iteration": 2.822960138320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122737, + "balance_loss_mlp": 1.10572612, + "epoch": 0.5719507502885726, + "flos": 667907016192.0, + "grad_norm": 0.06522156109142956, + "language_loss": 0.81529987, + "learning_rate": 0.00040830538708824983, + "loss": 0.8265273, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.17028809, + "step": 2973, + "time_per_iteration": 2.883183479309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114225, + "balance_loss_mlp": 1.09716594, + "epoch": 0.572143131973836, + "flos": 476321969664.0, + "grad_norm": 0.05988777943056807, + "language_loss": 0.81712234, + "learning_rate": 0.000407999146236307, + "loss": 0.82826465, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.17077637, + "step": 2974, + "time_per_iteration": 2.583639144897461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113969, + "balance_loss_mlp": 1.09735084, + "epoch": 0.5723355136590996, + "flos": 539510782464.0, + "grad_norm": 0.08488733778098946, + "language_loss": 0.83322281, + "learning_rate": 0.0004076929411022634, + "loss": 0.84436244, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.16625977, + "step": 2975, + "time_per_iteration": 2.6634230613708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117906, + "balance_loss_mlp": 1.10096645, + "epoch": 0.5725278953443632, + "flos": 824156674560.0, + "grad_norm": 0.10471513442043413, + "language_loss": 0.7910713, + "learning_rate": 0.0004073867718049982, + "loss": 0.80225033, + 
"num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.16955566, + "step": 2976, + "time_per_iteration": 3.101864814758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116871, + "balance_loss_mlp": 1.10026503, + "epoch": 0.5727202770296268, + "flos": 587437235712.0, + "grad_norm": 0.08664196816998121, + "language_loss": 0.82484782, + "learning_rate": 0.00040708063846337704, + "loss": 0.83601654, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.1661377, + "step": 2977, + "time_per_iteration": 2.7438297271728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106932, + "balance_loss_mlp": 1.08967066, + "epoch": 0.5729126587148904, + "flos": 446966055936.0, + "grad_norm": 0.07799786255299582, + "language_loss": 0.81199914, + "learning_rate": 0.00040677454119625143, + "loss": 0.8230685, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.17285156, + "step": 2978, + "time_per_iteration": 2.5837550163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095322, + "balance_loss_mlp": 1.07809663, + "epoch": 0.5731050404001539, + "flos": 519457577472.0, + "grad_norm": 0.1059947946829761, + "language_loss": 0.82621056, + "learning_rate": 0.0004064684801224587, + "loss": 0.83716381, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.17236328, + "step": 2979, + "time_per_iteration": 2.6220715045928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095905, + "balance_loss_mlp": 1.07850003, + "epoch": 0.5732974220854175, + "flos": 504775950336.0, + "grad_norm": 0.06700215842091113, + "language_loss": 0.80611891, + "learning_rate": 0.00040616245536082224, + "loss": 0.81707793, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.17431641, + "step": 2980, + "time_per_iteration": 2.6067917346954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086913, + "balance_loss_mlp": 1.069556, + "epoch": 0.573489803770681, + "flos": 
592485041664.0, + "grad_norm": 0.19945027498537377, + "language_loss": 0.81268358, + "learning_rate": 0.00040585646703015165, + "loss": 0.82355273, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.17370605, + "step": 2981, + "time_per_iteration": 2.910644769668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087867, + "balance_loss_mlp": 1.07096314, + "epoch": 0.5736821854559446, + "flos": 489911514624.0, + "grad_norm": 0.06421268852729406, + "language_loss": 0.78161913, + "learning_rate": 0.0004055505152492419, + "loss": 0.79249781, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.16918945, + "step": 2982, + "time_per_iteration": 2.6653785705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084086, + "balance_loss_mlp": 1.06670547, + "epoch": 0.5738745671412081, + "flos": 458156321280.0, + "grad_norm": 0.08054865949602324, + "language_loss": 0.73896229, + "learning_rate": 0.00040524460013687425, + "loss": 0.74980319, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.1739502, + "step": 2983, + "time_per_iteration": 2.721282958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090667, + "balance_loss_mlp": 1.07357204, + "epoch": 0.5740669488264717, + "flos": 580333372416.0, + "grad_norm": 0.08106324915579151, + "language_loss": 0.81038249, + "learning_rate": 0.0004049387218118155, + "loss": 0.82128918, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.17102051, + "step": 2984, + "time_per_iteration": 2.9739558696746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109026, + "balance_loss_mlp": 1.07321286, + "epoch": 0.5742593305117353, + "flos": 524438572032.0, + "grad_norm": 0.07771926917330779, + "language_loss": 0.84678066, + "learning_rate": 0.00040463288039281777, + "loss": 0.85768324, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.1706543, + "step": 2985, + "time_per_iteration": 2.755789279937744 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049819, + "balance_loss_mlp": 1.0396148, + "epoch": 0.5744517121969989, + "flos": 1553877748224.0, + "grad_norm": 0.027186215876947157, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78926235, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.10205078, + "step": 2986, + "time_per_iteration": 5.024104833602905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102309, + "balance_loss_mlp": 1.08496404, + "epoch": 0.5746440938822625, + "flos": 751919915520.0, + "grad_norm": 0.07406110021904912, + "language_loss": 0.82250667, + "learning_rate": 0.0004040213087479444, + "loss": 0.83352977, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.17346191, + "step": 2987, + "time_per_iteration": 2.954012632369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110505, + "balance_loss_mlp": 1.0885036, + "epoch": 0.5748364755675259, + "flos": 501865320960.0, + "grad_norm": 0.08213209001088305, + "language_loss": 0.85105377, + "learning_rate": 0.0004037155787595018, + "loss": 0.86210424, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.16552734, + "step": 2988, + "time_per_iteration": 2.596590757369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103556, + "balance_loss_mlp": 1.08671117, + "epoch": 0.5750288572527895, + "flos": 504044342784.0, + "grad_norm": 0.06658279323725882, + "language_loss": 0.80333447, + "learning_rate": 0.000403409886151987, + "loss": 0.8143701, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.1685791, + "step": 2989, + "time_per_iteration": 2.9190666675567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049496, + "balance_loss_mlp": 1.03948224, + "epoch": 0.5752212389380531, + "flos": 1541365604352.0, + "grad_norm": 0.024963739862010757, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.830486, 
+ "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.10009766, + "step": 2990, + "time_per_iteration": 4.779403448104858 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_mlp": 1.03442252, + "epoch": 0.5754136206233167, + "flos": 1567331472384.0, + "grad_norm": 0.02279292821926405, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79242849, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.09814453, + "step": 2991, + "time_per_iteration": 4.813813209533691 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104349, + "balance_loss_mlp": 1.08761191, + "epoch": 0.5756060023085803, + "flos": 798156301824.0, + "grad_norm": 0.07351496217070447, + "language_loss": 0.76526999, + "learning_rate": 0.00040249303380173807, + "loss": 0.77631354, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.16748047, + "step": 2992, + "time_per_iteration": 3.0984480381011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099869, + "balance_loss_mlp": 1.08323884, + "epoch": 0.5757983839938438, + "flos": 587877004800.0, + "grad_norm": 0.07106147833910306, + "language_loss": 0.78964388, + "learning_rate": 0.00040218749190459126, + "loss": 0.80064261, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.16638184, + "step": 2993, + "time_per_iteration": 2.7525393962860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109225, + "balance_loss_mlp": 1.07550144, + "epoch": 0.5759907656791073, + "flos": 516831072768.0, + "grad_norm": 0.07997694276494066, + "language_loss": 0.82424486, + "learning_rate": 0.00040188198798162775, + "loss": 0.83516741, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.16760254, + "step": 2994, + "time_per_iteration": 2.6026856899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105077, + "balance_loss_mlp": 1.08812571, + "epoch": 0.5761831473643709, + "flos": 
587133287424.0, + "grad_norm": 0.060991263028610375, + "language_loss": 0.85548359, + "learning_rate": 0.000401576522151455, + "loss": 0.86653435, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.16955566, + "step": 2995, + "time_per_iteration": 2.8387343883514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097873, + "balance_loss_mlp": 1.08148181, + "epoch": 0.5763755290496345, + "flos": 543896363520.0, + "grad_norm": 0.0649014718190417, + "language_loss": 0.82459986, + "learning_rate": 0.0004012710945326651, + "loss": 0.83557856, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.16394043, + "step": 2996, + "time_per_iteration": 2.8002259731292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099492, + "balance_loss_mlp": 1.08355331, + "epoch": 0.576567910734898, + "flos": 626229107712.0, + "grad_norm": 0.07884412717722156, + "language_loss": 0.80980134, + "learning_rate": 0.0004009657052438355, + "loss": 0.82079625, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.15930176, + "step": 2997, + "time_per_iteration": 2.8380162715911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106429, + "balance_loss_mlp": 1.09044361, + "epoch": 0.5767602924201616, + "flos": 538243232256.0, + "grad_norm": 0.09100511136442054, + "language_loss": 0.8548094, + "learning_rate": 0.00040066035440352904, + "loss": 0.86587369, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.15979004, + "step": 2998, + "time_per_iteration": 2.7165040969848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054645, + "balance_loss_mlp": 1.04687226, + "epoch": 0.5769526741054252, + "flos": 1559778301440.0, + "grad_norm": 0.029413044868518267, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80347776, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.07763672, + "step": 2999, + "time_per_iteration": 4.891362905502319 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105098, + "balance_loss_mlp": 1.08894527, + "epoch": 0.5771450557906888, + "flos": 468185495040.0, + "grad_norm": 0.08263350927787948, + "language_loss": 0.75637519, + "learning_rate": 0.00040004976854266145, + "loss": 0.76742619, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.16149902, + "step": 3000, + "time_per_iteration": 2.5579755306243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105893, + "balance_loss_mlp": 1.08987141, + "epoch": 0.5773374374759523, + "flos": 574556903424.0, + "grad_norm": 0.06941869769704709, + "language_loss": 0.81322896, + "learning_rate": 0.0003997445337591505, + "loss": 0.82428795, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.16027832, + "step": 3001, + "time_per_iteration": 2.689349889755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104956, + "balance_loss_mlp": 1.0884937, + "epoch": 0.5775298191612158, + "flos": 528473590272.0, + "grad_norm": 0.09192868754767076, + "language_loss": 0.74184531, + "learning_rate": 0.0003994393378982635, + "loss": 0.75289488, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.16467285, + "step": 3002, + "time_per_iteration": 2.6561992168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074164, + "balance_loss_mlp": 1.06658196, + "epoch": 0.5777222008464794, + "flos": 1303919700480.0, + "grad_norm": 0.035051917356449074, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80612171, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.07568359, + "step": 3003, + "time_per_iteration": 4.835859298706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101477, + "balance_loss_mlp": 1.0852406, + "epoch": 0.577914582531743, + "flos": 603633461760.0, + "grad_norm": 0.07939797508674061, + "language_loss": 0.8815853, + "learning_rate": 0.0003988290634182961, + "loss": 
0.89260006, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.16235352, + "step": 3004, + "time_per_iteration": 2.8315813541412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106342, + "balance_loss_mlp": 1.09034419, + "epoch": 0.5781069642170066, + "flos": 486795681792.0, + "grad_norm": 0.07086440080231367, + "language_loss": 0.80762905, + "learning_rate": 0.0003985239850361453, + "loss": 0.81869251, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.15991211, + "step": 3005, + "time_per_iteration": 2.6647462844848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100467, + "balance_loss_mlp": 1.08430243, + "epoch": 0.5782993459022701, + "flos": 506295318528.0, + "grad_norm": 0.07031230145466298, + "language_loss": 0.84713155, + "learning_rate": 0.0003982189460504777, + "loss": 0.85813624, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.16162109, + "step": 3006, + "time_per_iteration": 2.70588755607605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104818, + "balance_loss_mlp": 1.08837891, + "epoch": 0.5784917275875336, + "flos": 602155938816.0, + "grad_norm": 0.07782537057878013, + "language_loss": 0.78822792, + "learning_rate": 0.00039791394657971935, + "loss": 0.79927599, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.16442871, + "step": 3007, + "time_per_iteration": 2.7525734901428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112062, + "balance_loss_mlp": 1.09575403, + "epoch": 0.5786841092727972, + "flos": 521540425728.0, + "grad_norm": 0.08023947055085524, + "language_loss": 0.84335512, + "learning_rate": 0.00039760898674228205, + "loss": 0.85447574, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.16308594, + "step": 3008, + "time_per_iteration": 2.6740429401397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.08913136, + "epoch": 0.5788764909580608, 
+ "flos": 767404357632.0, + "grad_norm": 0.06481055961735596, + "language_loss": 0.80689526, + "learning_rate": 0.0003973040666565613, + "loss": 0.81794715, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.16052246, + "step": 3009, + "time_per_iteration": 3.0985798835754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105331, + "balance_loss_mlp": 1.08880866, + "epoch": 0.5790688726433244, + "flos": 599094434304.0, + "grad_norm": 0.07104717657711816, + "language_loss": 0.8190769, + "learning_rate": 0.000396999186440938, + "loss": 0.83013022, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.1652832, + "step": 3010, + "time_per_iteration": 2.8631935119628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095613, + "balance_loss_mlp": 1.07888842, + "epoch": 0.5792612543285879, + "flos": 523064936448.0, + "grad_norm": 0.07539914783858101, + "language_loss": 0.85185289, + "learning_rate": 0.000396694346213777, + "loss": 0.86280894, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.1673584, + "step": 3011, + "time_per_iteration": 2.7040622234344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093449, + "balance_loss_mlp": 1.0765686, + "epoch": 0.5794536360138515, + "flos": 876557915136.0, + "grad_norm": 0.06256207841015303, + "language_loss": 0.83364058, + "learning_rate": 0.0003963895460934276, + "loss": 0.84457505, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.16882324, + "step": 3012, + "time_per_iteration": 3.173614025115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089884, + "balance_loss_mlp": 1.07312369, + "epoch": 0.5796460176991151, + "flos": 401436311040.0, + "grad_norm": 0.08299946451997237, + "language_loss": 0.85058802, + "learning_rate": 0.00039608478619822376, + "loss": 0.86148685, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.16772461, + "step": 3013, + "time_per_iteration": 2.4611692428588867 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081939, + "balance_loss_mlp": 1.065166, + "epoch": 0.5798383993843786, + "flos": 618517721088.0, + "grad_norm": 0.06639451681987794, + "language_loss": 0.82375103, + "learning_rate": 0.00039578006664648394, + "loss": 0.83457041, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.16784668, + "step": 3014, + "time_per_iteration": 2.789212703704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085955, + "balance_loss_mlp": 1.06965923, + "epoch": 0.5800307810696421, + "flos": 844331019264.0, + "grad_norm": 0.08034627380925646, + "language_loss": 0.81074166, + "learning_rate": 0.0003954753875565105, + "loss": 0.82160121, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.16296387, + "step": 3015, + "time_per_iteration": 3.1160459518432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082316, + "balance_loss_mlp": 1.06503117, + "epoch": 0.5802231627549057, + "flos": 569276729856.0, + "grad_norm": 0.06677664636320767, + "language_loss": 0.82464337, + "learning_rate": 0.00039517074904659057, + "loss": 0.83546656, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.1730957, + "step": 3016, + "time_per_iteration": 2.716564655303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087588, + "balance_loss_mlp": 1.07085133, + "epoch": 0.5804155444401693, + "flos": 660459930624.0, + "grad_norm": 0.0799627957481028, + "language_loss": 0.84913206, + "learning_rate": 0.00039486615123499535, + "loss": 0.86000794, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.16748047, + "step": 3017, + "time_per_iteration": 2.855402708053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079434, + "balance_loss_mlp": 1.06237507, + "epoch": 0.5806079261254329, + "flos": 513992024064.0, + "grad_norm": 0.08435209251616928, + "language_loss": 0.85015523, + "learning_rate": 0.00039456159423997996, + "loss": 
0.86094958, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.17077637, + "step": 3018, + "time_per_iteration": 2.6843197345733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079601, + "balance_loss_mlp": 1.06261373, + "epoch": 0.5808003078106965, + "flos": 528646487040.0, + "grad_norm": 0.06274674533021377, + "language_loss": 0.89687812, + "learning_rate": 0.00039425707817978406, + "loss": 0.90767419, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.16992188, + "step": 3019, + "time_per_iteration": 2.681183099746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076495, + "balance_loss_mlp": 1.05895901, + "epoch": 0.58099268949596, + "flos": 477028611072.0, + "grad_norm": 0.14184929094941942, + "language_loss": 0.83556581, + "learning_rate": 0.00039395260317263124, + "loss": 0.84633076, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.17553711, + "step": 3020, + "time_per_iteration": 2.629709482192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073542, + "balance_loss_mlp": 1.05577993, + "epoch": 0.5811850711812235, + "flos": 517609294848.0, + "grad_norm": 0.08203162266100236, + "language_loss": 0.84840143, + "learning_rate": 0.0003936481693367291, + "loss": 0.85913682, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.1776123, + "step": 3021, + "time_per_iteration": 2.717710018157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083798, + "balance_loss_mlp": 1.06607115, + "epoch": 0.5813774528664871, + "flos": 616422389760.0, + "grad_norm": 0.08298145922497896, + "language_loss": 0.87323809, + "learning_rate": 0.0003933437767902697, + "loss": 0.88407612, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.17749023, + "step": 3022, + "time_per_iteration": 2.8179917335510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.07563782, + "epoch": 0.5815698345517507, + 
"flos": 567475435008.0, + "grad_norm": 0.07663513037653054, + "language_loss": 0.77978808, + "learning_rate": 0.00039303942565142825, + "loss": 0.79071838, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.17407227, + "step": 3023, + "time_per_iteration": 2.7656824588775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092602, + "balance_loss_mlp": 1.07522154, + "epoch": 0.5817622162370142, + "flos": 563168775168.0, + "grad_norm": 0.09353579288790682, + "language_loss": 0.76389718, + "learning_rate": 0.0003927351160383644, + "loss": 0.77482319, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.1739502, + "step": 3024, + "time_per_iteration": 2.81196665763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096766, + "balance_loss_mlp": 1.07968342, + "epoch": 0.5819545979222778, + "flos": 459216470016.0, + "grad_norm": 0.05988996320852443, + "language_loss": 0.77658468, + "learning_rate": 0.000392430848069222, + "loss": 0.78755236, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.17089844, + "step": 3025, + "time_per_iteration": 2.553349733352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095833, + "balance_loss_mlp": 1.07864261, + "epoch": 0.5821469796075414, + "flos": 541475062272.0, + "grad_norm": 0.09842162601860249, + "language_loss": 0.82432085, + "learning_rate": 0.00039212662186212795, + "loss": 0.83527917, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.17199707, + "step": 3026, + "time_per_iteration": 2.6321003437042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096184, + "balance_loss_mlp": 1.07874346, + "epoch": 0.582339361292805, + "flos": 552262634496.0, + "grad_norm": 0.06216962714468932, + "language_loss": 0.77065325, + "learning_rate": 0.0003918224375351934, + "loss": 0.78161508, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.17468262, + "step": 3027, + "time_per_iteration": 2.7319040298461914 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102446, + "balance_loss_mlp": 1.08531559, + "epoch": 0.5825317429780685, + "flos": 496399767552.0, + "grad_norm": 0.06463813423056745, + "language_loss": 0.78389823, + "learning_rate": 0.0003915182952065135, + "loss": 0.79492265, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.17138672, + "step": 3028, + "time_per_iteration": 2.6997907161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097892, + "balance_loss_mlp": 1.08095205, + "epoch": 0.582724124663332, + "flos": 564162112512.0, + "grad_norm": 0.07943165793883354, + "language_loss": 0.87551522, + "learning_rate": 0.0003912141949941664, + "loss": 0.8864941, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.16955566, + "step": 3029, + "time_per_iteration": 2.7122318744659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091171, + "balance_loss_mlp": 1.07376611, + "epoch": 0.5829165063485956, + "flos": 492132754944.0, + "grad_norm": 0.08419707099866325, + "language_loss": 0.82715654, + "learning_rate": 0.0003909101370162143, + "loss": 0.83806825, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.17431641, + "step": 3030, + "time_per_iteration": 2.6301612854003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010224, + "balance_loss_mlp": 1.00211763, + "epoch": 0.5831088880338592, + "flos": 1528880997888.0, + "grad_norm": 0.006956762065680846, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73444116, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.08105469, + "step": 3031, + "time_per_iteration": 4.870691299438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091064, + "balance_loss_mlp": 1.07400537, + "epoch": 0.5833012697191228, + "flos": 618011140608.0, + "grad_norm": 0.08204338633061625, + "language_loss": 0.82931381, + "learning_rate": 0.0003903021482356622, + 
"loss": 0.8402245, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.1706543, + "step": 3032, + "time_per_iteration": 2.829430103302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091732, + "balance_loss_mlp": 1.07503033, + "epoch": 0.5834936514043862, + "flos": 767920849920.0, + "grad_norm": 0.08520682753706012, + "language_loss": 0.82501173, + "learning_rate": 0.00038999821766910465, + "loss": 0.8359291, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.16711426, + "step": 3033, + "time_per_iteration": 3.0449070930480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087325, + "balance_loss_mlp": 1.07023025, + "epoch": 0.5836860330896498, + "flos": 458371436544.0, + "grad_norm": 0.07138585009560579, + "language_loss": 0.85493183, + "learning_rate": 0.00038969432980902606, + "loss": 0.86580509, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.17114258, + "step": 3034, + "time_per_iteration": 2.6099114418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015774, + "balance_loss_mlp": 1.00771523, + "epoch": 0.5838784147749134, + "flos": 1361225585664.0, + "grad_norm": 0.011956814182891856, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80800277, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.08056641, + "step": 3035, + "time_per_iteration": 4.919405460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084349, + "balance_loss_mlp": 1.0678978, + "epoch": 0.584070796460177, + "flos": 567211133952.0, + "grad_norm": 0.0762930329312805, + "language_loss": 0.82252562, + "learning_rate": 0.00038908668268020953, + "loss": 0.83336914, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.16455078, + "step": 3036, + "time_per_iteration": 2.7005980014801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082695, + "balance_loss_mlp": 1.06603003, + "epoch": 
0.5842631781454406, + "flos": 611483240448.0, + "grad_norm": 0.07750025430989764, + "language_loss": 0.84744304, + "learning_rate": 0.00038878292364738097, + "loss": 0.85826999, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.16674805, + "step": 3037, + "time_per_iteration": 2.854461908340454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085343, + "balance_loss_mlp": 1.0690949, + "epoch": 0.5844555598307041, + "flos": 463384737792.0, + "grad_norm": 0.0866866607830145, + "language_loss": 0.86865294, + "learning_rate": 0.0003884792077928508, + "loss": 0.87950635, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.16235352, + "step": 3038, + "time_per_iteration": 2.526219606399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085659, + "balance_loss_mlp": 1.06974506, + "epoch": 0.5846479415159677, + "flos": 410215186944.0, + "grad_norm": 0.09714525133414084, + "language_loss": 0.76819932, + "learning_rate": 0.0003881755352345322, + "loss": 0.77905595, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.15905762, + "step": 3039, + "time_per_iteration": 2.5546979904174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086917, + "balance_loss_mlp": 1.0702157, + "epoch": 0.5848403232012312, + "flos": 491297633280.0, + "grad_norm": 0.09749751366402076, + "language_loss": 0.86787152, + "learning_rate": 0.0003878719060903207, + "loss": 0.87874067, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.16711426, + "step": 3040, + "time_per_iteration": 2.585848093032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091729, + "balance_loss_mlp": 1.07531416, + "epoch": 0.5850327048864948, + "flos": 584417949696.0, + "grad_norm": 0.0840209110893744, + "language_loss": 0.83088207, + "learning_rate": 0.0003875683204780961, + "loss": 0.84179938, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.16418457, + "step": 3041, + "time_per_iteration": 
2.7646286487579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096113, + "balance_loss_mlp": 1.08006763, + "epoch": 0.5852250865717584, + "flos": 651545233920.0, + "grad_norm": 0.08651728983241819, + "language_loss": 0.85210633, + "learning_rate": 0.00038726477851572043, + "loss": 0.86306751, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.16040039, + "step": 3042, + "time_per_iteration": 2.797314167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101767, + "balance_loss_mlp": 1.08557868, + "epoch": 0.5854174682570219, + "flos": 534588885504.0, + "grad_norm": 0.08316199388994981, + "language_loss": 0.80228806, + "learning_rate": 0.0003869612803210395, + "loss": 0.81330574, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.16186523, + "step": 3043, + "time_per_iteration": 2.6490185260772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103183, + "balance_loss_mlp": 1.08701873, + "epoch": 0.5856098499422855, + "flos": 509752175616.0, + "grad_norm": 0.06777837645025765, + "language_loss": 0.83051372, + "learning_rate": 0.0003866578260118817, + "loss": 0.84154546, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.16162109, + "step": 3044, + "time_per_iteration": 2.6326801776885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106723, + "balance_loss_mlp": 1.09098744, + "epoch": 0.5858022316275491, + "flos": 593893555200.0, + "grad_norm": 0.07505807062734855, + "language_loss": 0.83121902, + "learning_rate": 0.0003863544157060581, + "loss": 0.84228623, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.15722656, + "step": 3045, + "time_per_iteration": 2.7122910022735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113369, + "balance_loss_mlp": 1.09763348, + "epoch": 0.5859946133128127, + "flos": 559126416384.0, + "grad_norm": 0.06825767558676081, + "language_loss": 0.81871521, + "learning_rate": 
0.0003860510495213634, + "loss": 0.82984889, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.15722656, + "step": 3046, + "time_per_iteration": 2.8188610076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113296, + "balance_loss_mlp": 1.09753644, + "epoch": 0.5861869949980761, + "flos": 553695740928.0, + "grad_norm": 0.07680372972712284, + "language_loss": 0.7820521, + "learning_rate": 0.0003857477275755746, + "loss": 0.79318506, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.1574707, + "step": 3047, + "time_per_iteration": 2.680021047592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114239, + "balance_loss_mlp": 1.09859896, + "epoch": 0.5863793766833397, + "flos": 718667375616.0, + "grad_norm": 0.06132573168351462, + "language_loss": 0.83483028, + "learning_rate": 0.00038544444998645167, + "loss": 0.84597266, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.15625, + "step": 3048, + "time_per_iteration": 3.024035692214966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110798, + "balance_loss_mlp": 1.09482431, + "epoch": 0.5865717583686033, + "flos": 472289522688.0, + "grad_norm": 0.07774154556799634, + "language_loss": 0.81755519, + "learning_rate": 0.00038514121687173767, + "loss": 0.82866311, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.15966797, + "step": 3049, + "time_per_iteration": 2.602348566055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106413, + "balance_loss_mlp": 1.09079647, + "epoch": 0.5867641400538669, + "flos": 813482901504.0, + "grad_norm": 0.07288499528915, + "language_loss": 0.81607699, + "learning_rate": 0.00038483802834915807, + "loss": 0.82714111, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.15600586, + "step": 3050, + "time_per_iteration": 3.0202012062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102645, + "balance_loss_mlp": 1.08663559, + 
"epoch": 0.5869565217391305, + "flos": 486531380736.0, + "grad_norm": 0.06464020852625685, + "language_loss": 0.78985357, + "learning_rate": 0.00038453488453642074, + "loss": 0.80088001, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.16003418, + "step": 3051, + "time_per_iteration": 2.6733691692352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101193, + "balance_loss_mlp": 1.0853616, + "epoch": 0.587148903424394, + "flos": 569385386496.0, + "grad_norm": 0.11499584820010532, + "language_loss": 0.86622018, + "learning_rate": 0.00038423178555121697, + "loss": 0.87723207, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.1583252, + "step": 3052, + "time_per_iteration": 2.7339212894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091842, + "balance_loss_mlp": 1.07583237, + "epoch": 0.5873412851096576, + "flos": 747296824320.0, + "grad_norm": 0.06975664982977658, + "language_loss": 0.85649264, + "learning_rate": 0.00038392873151121994, + "loss": 0.86741114, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.16003418, + "step": 3053, + "time_per_iteration": 3.0498745441436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094641, + "balance_loss_mlp": 1.07848823, + "epoch": 0.5875336667949211, + "flos": 528142477824.0, + "grad_norm": 0.07594371919491524, + "language_loss": 0.82729709, + "learning_rate": 0.0003836257225340859, + "loss": 0.83824348, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.16149902, + "step": 3054, + "time_per_iteration": 2.6312718391418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083342, + "balance_loss_mlp": 1.0662595, + "epoch": 0.5877260484801847, + "flos": 824166586368.0, + "grad_norm": 0.07226211151265562, + "language_loss": 0.81785333, + "learning_rate": 0.00038332275873745336, + "loss": 0.82868683, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.17102051, + "step": 3055, + 
"time_per_iteration": 3.0953447818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086607, + "balance_loss_mlp": 1.06990623, + "epoch": 0.5879184301654482, + "flos": 591598162944.0, + "grad_norm": 0.05891266503615663, + "language_loss": 0.82779503, + "learning_rate": 0.0003830198402389431, + "loss": 0.83866107, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.16711426, + "step": 3056, + "time_per_iteration": 2.7385828495025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022532, + "balance_loss_mlp": 1.01485538, + "epoch": 0.5881108118507118, + "flos": 1545805513728.0, + "grad_norm": 0.023195211062617696, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78371465, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.07666016, + "step": 3057, + "time_per_iteration": 5.0122692584991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082378, + "balance_loss_mlp": 1.06487858, + "epoch": 0.5883031935359754, + "flos": 489597654528.0, + "grad_norm": 0.09420327310468278, + "language_loss": 0.82856947, + "learning_rate": 0.0003824141396066855, + "loss": 0.83939326, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.17504883, + "step": 3058, + "time_per_iteration": 2.630829334259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086117, + "balance_loss_mlp": 1.06941545, + "epoch": 0.588495575221239, + "flos": 582836539392.0, + "grad_norm": 0.07561205741670568, + "language_loss": 0.82764673, + "learning_rate": 0.000382111357708092, + "loss": 0.83850795, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.16711426, + "step": 3059, + "time_per_iteration": 2.754732608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079118, + "balance_loss_mlp": 1.06203532, + "epoch": 0.5886879569065026, + "flos": 661048003584.0, + "grad_norm": 0.07214212654246877, + "language_loss": 0.83606875, + 
"learning_rate": 0.00038180862157792864, + "loss": 0.84685993, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.17102051, + "step": 3060, + "time_per_iteration": 2.8452963829040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079154, + "balance_loss_mlp": 1.06195176, + "epoch": 0.588880338591766, + "flos": 562657425408.0, + "grad_norm": 0.06766423660124334, + "language_loss": 0.81912309, + "learning_rate": 0.0003815059313337279, + "loss": 0.82991457, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.17224121, + "step": 3061, + "time_per_iteration": 2.699923515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075885, + "balance_loss_mlp": 1.05862319, + "epoch": 0.5890727202770296, + "flos": 554730923520.0, + "grad_norm": 0.05609969141419105, + "language_loss": 0.78319967, + "learning_rate": 0.00038120328709300436, + "loss": 0.79395854, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.17272949, + "step": 3062, + "time_per_iteration": 2.9140214920043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073381, + "balance_loss_mlp": 1.05580938, + "epoch": 0.5892651019622932, + "flos": 655520781312.0, + "grad_norm": 0.06388746068798092, + "language_loss": 0.83677167, + "learning_rate": 0.0003809006889732549, + "loss": 0.84750545, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.17590332, + "step": 3063, + "time_per_iteration": 2.812375068664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073036, + "balance_loss_mlp": 1.05551219, + "epoch": 0.5894574836475568, + "flos": 453202490880.0, + "grad_norm": 0.1840205152254721, + "language_loss": 0.87883544, + "learning_rate": 0.0003805981370919589, + "loss": 0.88956577, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.17529297, + "step": 3064, + "time_per_iteration": 2.5644187927246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073964, + 
"balance_loss_mlp": 1.05604672, + "epoch": 0.5896498653328203, + "flos": 519032489472.0, + "grad_norm": 0.08741335688742048, + "language_loss": 0.83813435, + "learning_rate": 0.0003802956315665771, + "loss": 0.84887397, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.17932129, + "step": 3065, + "time_per_iteration": 2.6985597610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077561, + "balance_loss_mlp": 1.0604192, + "epoch": 0.5898422470180839, + "flos": 549050628096.0, + "grad_norm": 0.09549414349914971, + "language_loss": 0.81565332, + "learning_rate": 0.0003799931725145529, + "loss": 0.82642901, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.17150879, + "step": 3066, + "time_per_iteration": 2.621553897857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_mlp": 1.06172466, + "epoch": 0.5900346287033474, + "flos": 524312663040.0, + "grad_norm": 0.06470265589627064, + "language_loss": 0.85731423, + "learning_rate": 0.00037969076005331083, + "loss": 0.86810863, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.17736816, + "step": 3067, + "time_per_iteration": 2.7705938816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108525, + "balance_loss_mlp": 1.06776178, + "epoch": 0.590227010388611, + "flos": 567156805632.0, + "grad_norm": 0.07323535980547291, + "language_loss": 0.87987936, + "learning_rate": 0.00037938839430025817, + "loss": 0.89073181, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.17504883, + "step": 3068, + "time_per_iteration": 2.6688857078552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085401, + "balance_loss_mlp": 1.06792498, + "epoch": 0.5904193920738746, + "flos": 583333208064.0, + "grad_norm": 0.13096377841439616, + "language_loss": 0.85380679, + "learning_rate": 0.0003790860753727835, + "loss": 0.86466074, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 
0.17492676, + "step": 3069, + "time_per_iteration": 2.9018454551696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091061, + "balance_loss_mlp": 1.07345426, + "epoch": 0.5906117737591381, + "flos": 529701493248.0, + "grad_norm": 0.0726049430242405, + "language_loss": 0.82249814, + "learning_rate": 0.00037878380338825766, + "loss": 0.83340883, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.1763916, + "step": 3070, + "time_per_iteration": 2.695953607559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095711, + "balance_loss_mlp": 1.07847357, + "epoch": 0.5908041554444017, + "flos": 684229151232.0, + "grad_norm": 0.07160608760806797, + "language_loss": 0.81351429, + "learning_rate": 0.00037848157846403287, + "loss": 0.82447141, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.17248535, + "step": 3071, + "time_per_iteration": 2.900130271911621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096895, + "balance_loss_mlp": 1.07976437, + "epoch": 0.5909965371296653, + "flos": 550001746944.0, + "grad_norm": 0.08831271669304017, + "language_loss": 0.83602202, + "learning_rate": 0.0003781794007174435, + "loss": 0.846991, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.17150879, + "step": 3072, + "time_per_iteration": 2.7315585613250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052839, + "balance_loss_mlp": 1.0453527, + "epoch": 0.5911889188149289, + "flos": 1492361750016.0, + "grad_norm": 0.018548344346269084, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75127375, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.07470703, + "step": 3073, + "time_per_iteration": 4.843595027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096296, + "balance_loss_mlp": 1.07984531, + "epoch": 0.5913813005001923, + "flos": 487880423424.0, + "grad_norm": 0.06605464812454943, + 
"language_loss": 0.80771315, + "learning_rate": 0.0003775751872264152, + "loss": 0.81867611, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.16455078, + "step": 3074, + "time_per_iteration": 2.812434196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088175, + "balance_loss_mlp": 1.07113981, + "epoch": 0.5915736821854559, + "flos": 573331198464.0, + "grad_norm": 0.08890011139795934, + "language_loss": 0.86803812, + "learning_rate": 0.0003772731517165527, + "loss": 0.87891984, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.17041016, + "step": 3075, + "time_per_iteration": 2.8199949264526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087435, + "balance_loss_mlp": 1.07135379, + "epoch": 0.5917660638707195, + "flos": 789518389248.0, + "grad_norm": 0.06956331546073297, + "language_loss": 0.83378977, + "learning_rate": 0.0003769711638534784, + "loss": 0.8446641, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.16064453, + "step": 3076, + "time_per_iteration": 3.021451711654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097469, + "balance_loss_mlp": 1.08068419, + "epoch": 0.5919584455559831, + "flos": 528740462592.0, + "grad_norm": 0.07608235771804774, + "language_loss": 0.79065943, + "learning_rate": 0.00037666922375443446, + "loss": 0.80163419, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.16796875, + "step": 3077, + "time_per_iteration": 2.602043867111206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092109, + "balance_loss_mlp": 1.076123, + "epoch": 0.5921508272412467, + "flos": 560606510592.0, + "grad_norm": 0.09346086613563626, + "language_loss": 0.81744075, + "learning_rate": 0.00037636733153664396, + "loss": 0.82836187, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.15979004, + "step": 3078, + "time_per_iteration": 2.8222453594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01093493, + "balance_loss_mlp": 1.07719743, + "epoch": 0.5923432089265102, + "flos": 563272662528.0, + "grad_norm": 0.1116363853226753, + "language_loss": 0.79912782, + "learning_rate": 0.0003760654873173124, + "loss": 0.81006277, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.16296387, + "step": 3079, + "time_per_iteration": 2.6946070194244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085907, + "balance_loss_mlp": 1.06951547, + "epoch": 0.5925355906117737, + "flos": 495740113920.0, + "grad_norm": 0.06915984482876121, + "language_loss": 0.81859291, + "learning_rate": 0.00037576369121362566, + "loss": 0.82945192, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.16394043, + "step": 3080, + "time_per_iteration": 2.6502840518951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088263, + "balance_loss_mlp": 1.07191896, + "epoch": 0.5927279722970373, + "flos": 566249730048.0, + "grad_norm": 0.07693331015944839, + "language_loss": 0.8159368, + "learning_rate": 0.0003754619433427516, + "loss": 0.82681942, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.16345215, + "step": 3081, + "time_per_iteration": 2.9385058879852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084208, + "balance_loss_mlp": 1.06749439, + "epoch": 0.5929203539823009, + "flos": 666970578432.0, + "grad_norm": 0.07095697248954357, + "language_loss": 0.77517045, + "learning_rate": 0.0003751602438218392, + "loss": 0.78601247, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.16723633, + "step": 3082, + "time_per_iteration": 2.8245561122894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083121, + "balance_loss_mlp": 1.06693244, + "epoch": 0.5931127356675644, + "flos": 555744084480.0, + "grad_norm": 0.1021077750392874, + "language_loss": 0.83509332, + "learning_rate": 0.0003748585927680186, + "loss": 0.8459245, + "num_input_tokens_seen": 257210592, + 
"router_z_loss_mlp": 0.16186523, + "step": 3083, + "time_per_iteration": 2.6818346977233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084171, + "balance_loss_mlp": 1.06721938, + "epoch": 0.593305117352828, + "flos": 535194210816.0, + "grad_norm": 0.06846862154983226, + "language_loss": 0.82637662, + "learning_rate": 0.00037455699029840086, + "loss": 0.83721828, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.16967773, + "step": 3084, + "time_per_iteration": 2.6860570907592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088457, + "balance_loss_mlp": 1.07176781, + "epoch": 0.5934974990380916, + "flos": 593957795328.0, + "grad_norm": 0.06710726384898401, + "language_loss": 0.8462739, + "learning_rate": 0.0003742554365300787, + "loss": 0.85715848, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.16699219, + "step": 3085, + "time_per_iteration": 2.749816656112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088228, + "balance_loss_mlp": 1.07143116, + "epoch": 0.5936898807233552, + "flos": 712673220096.0, + "grad_norm": 0.08250802724924795, + "language_loss": 0.78595787, + "learning_rate": 0.0003739539315801255, + "loss": 0.79684019, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.16809082, + "step": 3086, + "time_per_iteration": 2.982919216156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092705, + "balance_loss_mlp": 1.07571757, + "epoch": 0.5938822624086187, + "flos": 391896465408.0, + "grad_norm": 0.083760246794696, + "language_loss": 0.91647482, + "learning_rate": 0.000373652475565596, + "loss": 0.9274019, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.16992188, + "step": 3087, + "time_per_iteration": 2.4816558361053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102413, + "balance_loss_mlp": 1.08528244, + "epoch": 0.5940746440938822, + "flos": 480285033984.0, + "grad_norm": 
0.09245346089356003, + "language_loss": 0.81352496, + "learning_rate": 0.00037335106860352587, + "loss": 0.82454908, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.17138672, + "step": 3088, + "time_per_iteration": 2.675565719604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095107, + "balance_loss_mlp": 1.07863212, + "epoch": 0.5942670257791458, + "flos": 483336626688.0, + "grad_norm": 0.10172018328041595, + "language_loss": 0.83090484, + "learning_rate": 0.00037304971081093146, + "loss": 0.84185594, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.16479492, + "step": 3089, + "time_per_iteration": 2.614063024520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102174, + "balance_loss_mlp": 1.08573484, + "epoch": 0.5944594074644094, + "flos": 547936151040.0, + "grad_norm": 0.09417550180705583, + "language_loss": 0.81048489, + "learning_rate": 0.00037274840230481024, + "loss": 0.82150662, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.16442871, + "step": 3090, + "time_per_iteration": 2.791287899017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106483, + "balance_loss_mlp": 1.09013939, + "epoch": 0.594651789149673, + "flos": 449179955712.0, + "grad_norm": 0.08210045649904979, + "language_loss": 0.79059577, + "learning_rate": 0.00037244714320214077, + "loss": 0.80166066, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.16345215, + "step": 3091, + "time_per_iteration": 2.5437703132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101492, + "balance_loss_mlp": 1.08511281, + "epoch": 0.5948441708349365, + "flos": 596267868672.0, + "grad_norm": 0.06960715408232113, + "language_loss": 0.83210528, + "learning_rate": 0.000372145933619882, + "loss": 0.84312022, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.16381836, + "step": 3092, + "time_per_iteration": 2.902186155319214 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01112879, + "balance_loss_mlp": 1.0964278, + "epoch": 0.5950365525202, + "flos": 548516883456.0, + "grad_norm": 0.11673775861228046, + "language_loss": 0.82268316, + "learning_rate": 0.000371844773674974, + "loss": 0.833812, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.16455078, + "step": 3093, + "time_per_iteration": 2.6614809036254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116363, + "balance_loss_mlp": 1.10023379, + "epoch": 0.5952289342054636, + "flos": 654700340736.0, + "grad_norm": 0.0944691086002383, + "language_loss": 0.81785637, + "learning_rate": 0.0003715436634843375, + "loss": 0.82902002, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.16125488, + "step": 3094, + "time_per_iteration": 2.90022873878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117363, + "balance_loss_mlp": 1.10172296, + "epoch": 0.5954213158907272, + "flos": 603364018176.0, + "grad_norm": 0.057224396595454204, + "language_loss": 0.80872512, + "learning_rate": 0.00037124260316487355, + "loss": 0.81989878, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.15625, + "step": 3095, + "time_per_iteration": 2.885049819946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114273, + "balance_loss_mlp": 1.09841847, + "epoch": 0.5956136975759908, + "flos": 486331319808.0, + "grad_norm": 0.06086987109203959, + "language_loss": 0.89374322, + "learning_rate": 0.0003709415928334643, + "loss": 0.90488601, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.15844727, + "step": 3096, + "time_per_iteration": 2.6082546710968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011177, + "balance_loss_mlp": 1.10172629, + "epoch": 0.5958060792612543, + "flos": 658777204224.0, + "grad_norm": 0.09348672972793858, + "language_loss": 0.80559552, + "learning_rate": 0.00037064063260697233, + "loss": 0.81677252, + "num_input_tokens_seen": 
258228896, + "router_z_loss_mlp": 0.15966797, + "step": 3097, + "time_per_iteration": 2.8901162147521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123233, + "balance_loss_mlp": 1.10749698, + "epoch": 0.5959984609465179, + "flos": 723559537152.0, + "grad_norm": 0.06876216438303968, + "language_loss": 0.78693187, + "learning_rate": 0.0003703397226022407, + "loss": 0.79816419, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.15722656, + "step": 3098, + "time_per_iteration": 3.066073179244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102465, + "balance_loss_mlp": 1.09416783, + "epoch": 0.5961908426317815, + "flos": 1519849557504.0, + "grad_norm": 0.03442912107402327, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.7660234, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.08300781, + "step": 3099, + "time_per_iteration": 4.9653050899505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127115, + "balance_loss_mlp": 1.11136746, + "epoch": 0.596383224317045, + "flos": 532614693888.0, + "grad_norm": 0.0680420214228425, + "language_loss": 0.8297379, + "learning_rate": 0.0003697380537253339, + "loss": 0.84100908, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.15734863, + "step": 3100, + "time_per_iteration": 2.715177059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113476, + "balance_loss_mlp": 1.0978117, + "epoch": 0.5965756060023086, + "flos": 591210150912.0, + "grad_norm": 0.06669871573577384, + "language_loss": 0.81245238, + "learning_rate": 0.0003694372950867471, + "loss": 0.82358712, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.15649414, + "step": 3101, + "time_per_iteration": 2.8005011081695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123061, + "balance_loss_mlp": 1.10731363, + "epoch": 0.5967679876875721, + "flos": 862054327296.0, + "grad_norm": 
0.07790109934746459, + "language_loss": 0.77269602, + "learning_rate": 0.0003691365871370976, + "loss": 0.78392667, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.15734863, + "step": 3102, + "time_per_iteration": 3.077610731124878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118239, + "balance_loss_mlp": 1.10267067, + "epoch": 0.5969603693728357, + "flos": 553834132992.0, + "grad_norm": 0.06403529919974375, + "language_loss": 0.85239542, + "learning_rate": 0.00036883592999313093, + "loss": 0.86357784, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.15551758, + "step": 3103, + "time_per_iteration": 2.6910035610198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123035, + "balance_loss_mlp": 1.10726357, + "epoch": 0.5971527510580993, + "flos": 718662606336.0, + "grad_norm": 0.07439514059918453, + "language_loss": 0.7913959, + "learning_rate": 0.0003685353237715722, + "loss": 0.80262625, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.15759277, + "step": 3104, + "time_per_iteration": 2.8957912921905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118284, + "balance_loss_mlp": 1.10222602, + "epoch": 0.5973451327433629, + "flos": 647631355392.0, + "grad_norm": 0.09765250688336868, + "language_loss": 0.81377506, + "learning_rate": 0.0003682347685891274, + "loss": 0.82495785, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.16052246, + "step": 3105, + "time_per_iteration": 2.84584379196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106359, + "balance_loss_mlp": 1.09007454, + "epoch": 0.5975375144286263, + "flos": 721716397056.0, + "grad_norm": 0.07268165375697674, + "language_loss": 0.805511, + "learning_rate": 0.0003679342645624822, + "loss": 0.81657457, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.1628418, + "step": 3106, + "time_per_iteration": 3.0009236335754395 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01116176, + "balance_loss_mlp": 1.09978509, + "epoch": 0.5977298961138899, + "flos": 750961082880.0, + "grad_norm": 0.08276382082752762, + "language_loss": 0.81614435, + "learning_rate": 0.0003676338118083025, + "loss": 0.82730609, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.16394043, + "step": 3107, + "time_per_iteration": 3.088297128677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103194, + "balance_loss_mlp": 1.08736336, + "epoch": 0.5979222777991535, + "flos": 530961702912.0, + "grad_norm": 0.10722680659176895, + "language_loss": 0.79196644, + "learning_rate": 0.0003673334104432347, + "loss": 0.80299842, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.15820312, + "step": 3108, + "time_per_iteration": 2.634643077850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100064, + "balance_loss_mlp": 1.08379245, + "epoch": 0.5981146594844171, + "flos": 621749551104.0, + "grad_norm": 0.07294397192010518, + "language_loss": 0.8350544, + "learning_rate": 0.0003670330605839048, + "loss": 0.84605503, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.16271973, + "step": 3109, + "time_per_iteration": 2.8294010162353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091507, + "balance_loss_mlp": 1.0755446, + "epoch": 0.5983070411696807, + "flos": 603589045248.0, + "grad_norm": 0.08059004302640393, + "language_loss": 0.76664943, + "learning_rate": 0.0003667327623469191, + "loss": 0.77756447, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.1595459, + "step": 3110, + "time_per_iteration": 2.784902334213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100362, + "balance_loss_mlp": 1.084126, + "epoch": 0.5984994228549442, + "flos": 633483472896.0, + "grad_norm": 0.07319281645054936, + "language_loss": 0.77725756, + "learning_rate": 0.00036643251584886333, + "loss": 0.78826118, + "num_input_tokens_seen": 
259454336, + "router_z_loss_mlp": 0.16235352, + "step": 3111, + "time_per_iteration": 2.795421838760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100904, + "balance_loss_mlp": 1.08444118, + "epoch": 0.5986918045402078, + "flos": 525278836224.0, + "grad_norm": 0.07234799336755846, + "language_loss": 0.8192088, + "learning_rate": 0.00036613232120630393, + "loss": 0.83021784, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.16467285, + "step": 3112, + "time_per_iteration": 2.6119191646575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095181, + "balance_loss_mlp": 1.07855165, + "epoch": 0.5988841862254713, + "flos": 483180982272.0, + "grad_norm": 0.1220679262263155, + "language_loss": 0.7997117, + "learning_rate": 0.00036583217853578643, + "loss": 0.81066352, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.16638184, + "step": 3113, + "time_per_iteration": 2.5559191703796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095031, + "balance_loss_mlp": 1.07856846, + "epoch": 0.5990765679107349, + "flos": 1140149924352.0, + "grad_norm": 0.06954821000435275, + "language_loss": 0.77413309, + "learning_rate": 0.000365532087953837, + "loss": 0.78508341, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.16467285, + "step": 3114, + "time_per_iteration": 3.6444194316864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093034, + "balance_loss_mlp": 1.07666647, + "epoch": 0.5992689495959984, + "flos": 516986717184.0, + "grad_norm": 0.07355388338928669, + "language_loss": 0.89153886, + "learning_rate": 0.00036523204957696065, + "loss": 0.90246928, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.16369629, + "step": 3115, + "time_per_iteration": 2.6114542484283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090631, + "balance_loss_mlp": 1.07385826, + "epoch": 0.599461331281262, + "flos": 744618562560.0, + "grad_norm": 
0.06661163617003031, + "language_loss": 0.80990088, + "learning_rate": 0.00036493206352164324, + "loss": 0.82080722, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.16784668, + "step": 3116, + "time_per_iteration": 2.977773666381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099589, + "balance_loss_mlp": 1.08299482, + "epoch": 0.5996537129665256, + "flos": 592359132672.0, + "grad_norm": 0.06605770678363264, + "language_loss": 0.85320091, + "learning_rate": 0.000364632129904349, + "loss": 0.86419678, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.16601562, + "step": 3117, + "time_per_iteration": 2.7504782676696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109875, + "balance_loss_mlp": 1.08246565, + "epoch": 0.5998460946517892, + "flos": 559010419200.0, + "grad_norm": 0.07896925435607946, + "language_loss": 0.78125691, + "learning_rate": 0.00036433224884152283, + "loss": 0.79224437, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.1628418, + "step": 3118, + "time_per_iteration": 2.762640953063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106506, + "balance_loss_mlp": 1.09019828, + "epoch": 0.6000384763370528, + "flos": 484567100928.0, + "grad_norm": 0.08654027448722386, + "language_loss": 0.77639025, + "learning_rate": 0.00036403242044958875, + "loss": 0.78745532, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.16308594, + "step": 3119, + "time_per_iteration": 2.590341567993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105329, + "balance_loss_mlp": 1.08873463, + "epoch": 0.6002308580223162, + "flos": 596767108608.0, + "grad_norm": 0.12490963722323402, + "language_loss": 0.91469646, + "learning_rate": 0.0003637326448449507, + "loss": 0.92574978, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.16601562, + "step": 3120, + "time_per_iteration": 2.757040500640869 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01114298, + "balance_loss_mlp": 1.09782338, + "epoch": 0.6004232397075798, + "flos": 545146661376.0, + "grad_norm": 0.07048281834234121, + "language_loss": 0.85906887, + "learning_rate": 0.00036343292214399177, + "loss": 0.87021184, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.16479492, + "step": 3121, + "time_per_iteration": 2.7731616497039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110866, + "balance_loss_mlp": 1.09368825, + "epoch": 0.6006156213928434, + "flos": 629947694592.0, + "grad_norm": 0.08856935015061373, + "language_loss": 0.77217454, + "learning_rate": 0.00036313325246307456, + "loss": 0.78328323, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.171875, + "step": 3122, + "time_per_iteration": 2.8254263401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107144, + "balance_loss_mlp": 1.0897516, + "epoch": 0.600808003078107, + "flos": 582315277824.0, + "grad_norm": 0.07082824318872671, + "language_loss": 0.87116647, + "learning_rate": 0.0003628336359185411, + "loss": 0.88223791, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.17419434, + "step": 3123, + "time_per_iteration": 2.6960785388946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104705, + "balance_loss_mlp": 1.08815873, + "epoch": 0.6010003847633705, + "flos": 635274855936.0, + "grad_norm": 0.09352377906746982, + "language_loss": 0.75570095, + "learning_rate": 0.000362534072626713, + "loss": 0.76674795, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.16552734, + "step": 3124, + "time_per_iteration": 2.7963545322418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094202, + "balance_loss_mlp": 1.07738113, + "epoch": 0.6011927664486341, + "flos": 718763922432.0, + "grad_norm": 0.08561674190647896, + "language_loss": 0.81475127, + "learning_rate": 0.00036223456270389093, + "loss": 0.82569331, + "num_input_tokens_seen": 
260499616, + "router_z_loss_mlp": 0.16833496, + "step": 3125, + "time_per_iteration": 2.992478609085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085838, + "balance_loss_mlp": 1.06857657, + "epoch": 0.6013851481338977, + "flos": 499036184064.0, + "grad_norm": 0.08087477259987003, + "language_loss": 0.80765188, + "learning_rate": 0.00036193510626635517, + "loss": 0.81851029, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.17272949, + "step": 3126, + "time_per_iteration": 2.6718900203704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077369, + "balance_loss_mlp": 1.05972588, + "epoch": 0.6015775298191612, + "flos": 749587447296.0, + "grad_norm": 0.08853778728712877, + "language_loss": 0.81355464, + "learning_rate": 0.0003616357034303649, + "loss": 0.82432842, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.17663574, + "step": 3127, + "time_per_iteration": 2.9547274112701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075422, + "balance_loss_mlp": 1.05762434, + "epoch": 0.6017699115044248, + "flos": 593063202816.0, + "grad_norm": 0.1711605115844366, + "language_loss": 0.78441834, + "learning_rate": 0.0003613363543121584, + "loss": 0.79517257, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.17810059, + "step": 3128, + "time_per_iteration": 2.886970281600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065387, + "balance_loss_mlp": 1.04813766, + "epoch": 0.6019622931896883, + "flos": 515111270400.0, + "grad_norm": 0.08758734410380958, + "language_loss": 0.85043442, + "learning_rate": 0.00036103705902795357, + "loss": 0.86108834, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.17260742, + "step": 3129, + "time_per_iteration": 2.748079776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072993, + "balance_loss_mlp": 1.0555644, + "epoch": 0.6021546748749519, + "flos": 490469852160.0, + "grad_norm": 
0.09694707916442274, + "language_loss": 0.7971251, + "learning_rate": 0.0003607378176939471, + "loss": 0.80785501, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.17443848, + "step": 3130, + "time_per_iteration": 2.6402640342712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069092, + "balance_loss_mlp": 1.05256987, + "epoch": 0.6023470565602155, + "flos": 541032721920.0, + "grad_norm": 0.08416157217627585, + "language_loss": 0.82138842, + "learning_rate": 0.00036043863042631465, + "loss": 0.83207935, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.1652832, + "step": 3131, + "time_per_iteration": 2.679304838180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069317, + "balance_loss_mlp": 1.05229378, + "epoch": 0.6025394382454791, + "flos": 845020408320.0, + "grad_norm": 0.08544531393878185, + "language_loss": 0.76554382, + "learning_rate": 0.00036013949734121133, + "loss": 0.77623701, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.17028809, + "step": 3132, + "time_per_iteration": 3.1334645748138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071138, + "balance_loss_mlp": 1.05466342, + "epoch": 0.6027318199307425, + "flos": 577173496320.0, + "grad_norm": 0.08104461370045753, + "language_loss": 0.82059807, + "learning_rate": 0.00035984041855477043, + "loss": 0.8313095, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.16467285, + "step": 3133, + "time_per_iteration": 2.7347941398620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045627, + "balance_loss_mlp": 1.03842688, + "epoch": 0.6029242016160061, + "flos": 1470976754688.0, + "grad_norm": 0.025003389778794672, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79755521, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.07177734, + "step": 3134, + "time_per_iteration": 4.970470428466797 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01076232, + "balance_loss_mlp": 1.05934048, + "epoch": 0.6031165833012697, + "flos": 480744626688.0, + "grad_norm": 0.07365504722099776, + "language_loss": 0.79866755, + "learning_rate": 0.00035924242434230637, + "loss": 0.80942982, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.16906738, + "step": 3135, + "time_per_iteration": 2.7135050296783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107956, + "balance_loss_mlp": 1.06296587, + "epoch": 0.6033089649865333, + "flos": 499468612608.0, + "grad_norm": 0.08294049229736823, + "language_loss": 0.78440452, + "learning_rate": 0.00035894350914844516, + "loss": 0.79520017, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.16601562, + "step": 3136, + "time_per_iteration": 2.6597416400909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079884, + "balance_loss_mlp": 1.06325424, + "epoch": 0.6035013466717969, + "flos": 556613710848.0, + "grad_norm": 0.08267470686196479, + "language_loss": 0.83196414, + "learning_rate": 0.0003586446487175703, + "loss": 0.84276295, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.16638184, + "step": 3137, + "time_per_iteration": 2.7022488117218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084641, + "balance_loss_mlp": 1.06798732, + "epoch": 0.6036937283570604, + "flos": 594827421696.0, + "grad_norm": 0.064575038850489, + "language_loss": 0.85214019, + "learning_rate": 0.0003583458431657099, + "loss": 0.86298662, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.16662598, + "step": 3138, + "time_per_iteration": 2.7720208168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084009, + "balance_loss_mlp": 1.06771302, + "epoch": 0.603886110042324, + "flos": 540958569984.0, + "grad_norm": 0.09877124262847642, + "language_loss": 0.82838678, + "learning_rate": 0.00035804709260887056, + "loss": 0.83922684, + 
"num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.16296387, + "step": 3139, + "time_per_iteration": 2.6879312992095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086877, + "balance_loss_mlp": 1.07065237, + "epoch": 0.6040784917275875, + "flos": 518582808576.0, + "grad_norm": 0.07215366111763855, + "language_loss": 0.8912158, + "learning_rate": 0.0003577483971630373, + "loss": 0.90208459, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.16223145, + "step": 3140, + "time_per_iteration": 2.734809398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085015, + "balance_loss_mlp": 1.06892204, + "epoch": 0.6042708734128511, + "flos": 660751395840.0, + "grad_norm": 0.05656780869347305, + "language_loss": 0.84707594, + "learning_rate": 0.00035744975694417414, + "loss": 0.85792601, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.16088867, + "step": 3141, + "time_per_iteration": 2.8830533027648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083849, + "balance_loss_mlp": 1.06837583, + "epoch": 0.6044632550981146, + "flos": 572330520576.0, + "grad_norm": 0.12103965495464937, + "language_loss": 0.82471883, + "learning_rate": 0.00035715117206822344, + "loss": 0.83555734, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.15454102, + "step": 3142, + "time_per_iteration": 2.838871479034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085944, + "balance_loss_mlp": 1.06989884, + "epoch": 0.6046556367833782, + "flos": 546681083904.0, + "grad_norm": 0.07532409559899438, + "language_loss": 0.80957747, + "learning_rate": 0.0003568526426511065, + "loss": 0.82043689, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.16040039, + "step": 3143, + "time_per_iteration": 2.646676540374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088746, + "balance_loss_mlp": 1.07312953, + "epoch": 0.6048480184686418, + "flos": 
776838117888.0, + "grad_norm": 0.09699368707048923, + "language_loss": 0.82747424, + "learning_rate": 0.000356554168808722, + "loss": 0.83836174, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.15612793, + "step": 3144, + "time_per_iteration": 2.9851598739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093552, + "balance_loss_mlp": 1.07773244, + "epoch": 0.6050404001539054, + "flos": 657144036864.0, + "grad_norm": 0.07251607714921615, + "language_loss": 0.84944451, + "learning_rate": 0.00035625575065694837, + "loss": 0.86037999, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.15808105, + "step": 3145, + "time_per_iteration": 2.8598599433898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090889, + "balance_loss_mlp": 1.07443857, + "epoch": 0.605232781839169, + "flos": 548983816704.0, + "grad_norm": 0.07064458078135354, + "language_loss": 0.77895433, + "learning_rate": 0.0003559573883116415, + "loss": 0.78986323, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.16455078, + "step": 3146, + "time_per_iteration": 2.733262062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089343, + "balance_loss_mlp": 1.07359576, + "epoch": 0.6054251635244324, + "flos": 605402449920.0, + "grad_norm": 0.07444440196123078, + "language_loss": 0.85480058, + "learning_rate": 0.00035565908188863604, + "loss": 0.86569399, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.15734863, + "step": 3147, + "time_per_iteration": 2.853851079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091792, + "balance_loss_mlp": 1.07599723, + "epoch": 0.605617545209696, + "flos": 613679887872.0, + "grad_norm": 0.06196180807513896, + "language_loss": 0.79582435, + "learning_rate": 0.00035536083150374464, + "loss": 0.80674225, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.15783691, + "step": 3148, + "time_per_iteration": 2.776559352874756 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_mlp": 1.03995907, + "epoch": 0.6058099268949596, + "flos": 1498301577216.0, + "grad_norm": 0.024337037001299088, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75795162, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.07226562, + "step": 3149, + "time_per_iteration": 4.840685129165649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091535, + "balance_loss_mlp": 1.07552564, + "epoch": 0.6060023085802232, + "flos": 670476621312.0, + "grad_norm": 0.06209204496769419, + "language_loss": 0.85722816, + "learning_rate": 0.0003547644993114475, + "loss": 0.8681435, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.16003418, + "step": 3150, + "time_per_iteration": 2.8153529167175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092959, + "balance_loss_mlp": 1.07712793, + "epoch": 0.6061946902654868, + "flos": 606168562176.0, + "grad_norm": 0.07176933512118068, + "language_loss": 0.79877794, + "learning_rate": 0.00035446641773555806, + "loss": 0.80970764, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.15820312, + "step": 3151, + "time_per_iteration": 2.757474184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094094, + "balance_loss_mlp": 1.0779295, + "epoch": 0.6063870719507503, + "flos": 557844185088.0, + "grad_norm": 0.10666232173403664, + "language_loss": 0.86817247, + "learning_rate": 0.000354168392660816, + "loss": 0.87911344, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.16162109, + "step": 3152, + "time_per_iteration": 2.7577521800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093009, + "balance_loss_mlp": 1.07742882, + "epoch": 0.6065794536360138, + "flos": 557154796032.0, + "grad_norm": 0.06835832262029293, + "language_loss": 0.82626665, + "learning_rate": 0.0003538704242029252, + "loss": 
0.83719671, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.15576172, + "step": 3153, + "time_per_iteration": 2.7824299335479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096364, + "balance_loss_mlp": 1.08066463, + "epoch": 0.6067718353212774, + "flos": 690144385536.0, + "grad_norm": 0.07699381631687732, + "language_loss": 0.77828813, + "learning_rate": 0.0003535725124775672, + "loss": 0.7892518, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.15686035, + "step": 3154, + "time_per_iteration": 2.8603780269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101879, + "balance_loss_mlp": 1.085392, + "epoch": 0.606964217006541, + "flos": 521804726784.0, + "grad_norm": 0.06603606941894191, + "language_loss": 0.86388272, + "learning_rate": 0.00035327465760040126, + "loss": 0.87490153, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.16491699, + "step": 3155, + "time_per_iteration": 2.731767177581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102466, + "balance_loss_mlp": 1.08700442, + "epoch": 0.6071565986918045, + "flos": 641555707392.0, + "grad_norm": 0.08742295718167487, + "language_loss": 0.84376252, + "learning_rate": 0.00035297685968706526, + "loss": 0.85478723, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.15441895, + "step": 3156, + "time_per_iteration": 2.7879996299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099222, + "balance_loss_mlp": 1.08361709, + "epoch": 0.6073489803770681, + "flos": 560581917696.0, + "grad_norm": 0.07206801524938761, + "language_loss": 0.82717532, + "learning_rate": 0.00035267911885317454, + "loss": 0.83816749, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.15588379, + "step": 3157, + "time_per_iteration": 2.6752853393554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096892, + "balance_loss_mlp": 1.08108473, + "epoch": 0.6075413620623317, + 
"flos": 586088193024.0, + "grad_norm": 0.06913859395071588, + "language_loss": 0.81624317, + "learning_rate": 0.0003523814352143222, + "loss": 0.8272121, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.15795898, + "step": 3158, + "time_per_iteration": 2.851680040359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096859, + "balance_loss_mlp": 1.08079004, + "epoch": 0.6077337437475953, + "flos": 630812551680.0, + "grad_norm": 0.07191756501085539, + "language_loss": 0.90879536, + "learning_rate": 0.00035208380888607937, + "loss": 0.91976392, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.16064453, + "step": 3159, + "time_per_iteration": 2.8229289054870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030172, + "balance_loss_mlp": 1.02311516, + "epoch": 0.6079261254328588, + "flos": 1468503696384.0, + "grad_norm": 0.017458667771122316, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80492157, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.07080078, + "step": 3160, + "time_per_iteration": 4.860463619232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026979, + "balance_loss_mlp": 1.01992178, + "epoch": 0.6081185071181223, + "flos": 1523024861184.0, + "grad_norm": 0.015423076795417967, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76719207, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.07080078, + "step": 3161, + "time_per_iteration": 5.027961254119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090745, + "balance_loss_mlp": 1.07459164, + "epoch": 0.6083108888033859, + "flos": 556319674368.0, + "grad_norm": 0.06716496050507109, + "language_loss": 0.81388539, + "learning_rate": 0.00035119127492038446, + "loss": 0.82479286, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.16149902, + "step": 3162, + "time_per_iteration": 
2.8567075729370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090321, + "balance_loss_mlp": 1.07425177, + "epoch": 0.6085032704886495, + "flos": 841166000640.0, + "grad_norm": 0.07519938175586753, + "language_loss": 0.82571161, + "learning_rate": 0.00035089387898984436, + "loss": 0.83661485, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.16064453, + "step": 3163, + "time_per_iteration": 3.0894179344177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093705, + "balance_loss_mlp": 1.07734919, + "epoch": 0.6086956521739131, + "flos": 684792631296.0, + "grad_norm": 0.07531226352360243, + "language_loss": 0.81800103, + "learning_rate": 0.0003505965409474343, + "loss": 0.82893807, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.16357422, + "step": 3164, + "time_per_iteration": 2.909203290939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088443, + "balance_loss_mlp": 1.07221854, + "epoch": 0.6088880338591766, + "flos": 535799536128.0, + "grad_norm": 0.06350426788679164, + "language_loss": 0.86488736, + "learning_rate": 0.0003502992609085913, + "loss": 0.87577182, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.16223145, + "step": 3165, + "time_per_iteration": 2.6909096240997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087782, + "balance_loss_mlp": 1.07146227, + "epoch": 0.6090804155444401, + "flos": 731533026816.0, + "grad_norm": 0.0979130476844587, + "language_loss": 0.82205462, + "learning_rate": 0.00035000203898872954, + "loss": 0.83293247, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.16320801, + "step": 3166, + "time_per_iteration": 3.0287840366363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092824, + "balance_loss_mlp": 1.07664716, + "epoch": 0.6092727972297037, + "flos": 699014665728.0, + "grad_norm": 0.10375532619284132, + "language_loss": 0.84533244, + "learning_rate": 
0.0003497048753032406, + "loss": 0.85626066, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.16174316, + "step": 3167, + "time_per_iteration": 2.8883583545684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092408, + "balance_loss_mlp": 1.07648182, + "epoch": 0.6094651789149673, + "flos": 1051946735616.0, + "grad_norm": 0.06471277204040406, + "language_loss": 0.80592054, + "learning_rate": 0.000349407769967494, + "loss": 0.81684464, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.15917969, + "step": 3168, + "time_per_iteration": 3.386155605316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099872, + "balance_loss_mlp": 1.08381498, + "epoch": 0.6096575606002309, + "flos": 503085883392.0, + "grad_norm": 0.11400005862882004, + "language_loss": 0.84987879, + "learning_rate": 0.0003491107230968361, + "loss": 0.86087751, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.16052246, + "step": 3169, + "time_per_iteration": 2.6899755001068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096804, + "balance_loss_mlp": 1.08061576, + "epoch": 0.6098499422854944, + "flos": 585643281408.0, + "grad_norm": 0.06652355990642472, + "language_loss": 0.81221354, + "learning_rate": 0.00034881373480659085, + "loss": 0.82318163, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.16186523, + "step": 3170, + "time_per_iteration": 2.8547778129577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101156, + "balance_loss_mlp": 1.08508694, + "epoch": 0.610042323970758, + "flos": 469205996544.0, + "grad_norm": 0.08688268797683278, + "language_loss": 0.77884257, + "learning_rate": 0.0003485168052120594, + "loss": 0.78985405, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.16064453, + "step": 3171, + "time_per_iteration": 2.6543068885803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110044, + "balance_loss_mlp": 1.08477592, 
+ "epoch": 0.6102347056560216, + "flos": 514177403904.0, + "grad_norm": 0.09027989422234346, + "language_loss": 0.79380625, + "learning_rate": 0.00034821993442851973, + "loss": 0.80481064, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.15649414, + "step": 3172, + "time_per_iteration": 2.6117188930511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100941, + "balance_loss_mlp": 1.08522928, + "epoch": 0.6104270873412851, + "flos": 469013276160.0, + "grad_norm": 0.1005367012587997, + "language_loss": 0.82141685, + "learning_rate": 0.00034792312257122735, + "loss": 0.83242625, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.15698242, + "step": 3173, + "time_per_iteration": 2.634824752807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107115, + "balance_loss_mlp": 1.09201097, + "epoch": 0.6106194690265486, + "flos": 549875837952.0, + "grad_norm": 0.07806982240241292, + "language_loss": 0.80516702, + "learning_rate": 0.00034762636975541506, + "loss": 0.81623822, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.15087891, + "step": 3174, + "time_per_iteration": 2.7511277198791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111247, + "balance_loss_mlp": 1.09719944, + "epoch": 0.6108118507118122, + "flos": 472857772032.0, + "grad_norm": 0.09012937190678837, + "language_loss": 0.80371904, + "learning_rate": 0.0003473296760962923, + "loss": 0.81484377, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.15246582, + "step": 3175, + "time_per_iteration": 2.7333414554595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105343, + "balance_loss_mlp": 1.04603887, + "epoch": 0.6110042323970758, + "flos": 1445166904320.0, + "grad_norm": 0.017873347223140334, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79587168, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.07373047, + "step": 3176, + 
"time_per_iteration": 4.656734943389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112456, + "balance_loss_mlp": 1.10965848, + "epoch": 0.6111966140823394, + "flos": 794153590272.0, + "grad_norm": 0.07170779608360676, + "language_loss": 0.81361848, + "learning_rate": 0.00034673646670883976, + "loss": 0.82486403, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.14892578, + "step": 3177, + "time_per_iteration": 2.9838032722473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053161, + "balance_loss_mlp": 1.04572225, + "epoch": 0.611388995767603, + "flos": 1557650663424.0, + "grad_norm": 0.018001303469989904, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76768184, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.07421875, + "step": 3178, + "time_per_iteration": 4.987392425537109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130172, + "balance_loss_mlp": 1.11506796, + "epoch": 0.6115813774528664, + "flos": 712169210880.0, + "grad_norm": 0.0710561364168879, + "language_loss": 0.82215559, + "learning_rate": 0.0003461434953300865, + "loss": 0.83345723, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.15075684, + "step": 3179, + "time_per_iteration": 2.972102165222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129428, + "balance_loss_mlp": 1.11437213, + "epoch": 0.61177375913813, + "flos": 684308072448.0, + "grad_norm": 0.06625806695927375, + "language_loss": 0.81118929, + "learning_rate": 0.0003458470991817515, + "loss": 0.82248354, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.15039062, + "step": 3180, + "time_per_iteration": 2.9920318126678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138939, + "balance_loss_mlp": 1.12371635, + "epoch": 0.6119661408233936, + "flos": 511662127104.0, + "grad_norm": 0.09554430463950304, + "language_loss": 0.84819943, + 
"learning_rate": 0.0003455507628808802, + "loss": 0.8595888, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.15197754, + "step": 3181, + "time_per_iteration": 2.620678424835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138152, + "balance_loss_mlp": 1.12281001, + "epoch": 0.6121585225086572, + "flos": 556809002496.0, + "grad_norm": 0.07764809477009631, + "language_loss": 0.84588206, + "learning_rate": 0.00034525448654252076, + "loss": 0.85726357, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.15319824, + "step": 3182, + "time_per_iteration": 2.662243366241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132309, + "balance_loss_mlp": 1.11651397, + "epoch": 0.6123509041939207, + "flos": 561849467904.0, + "grad_norm": 0.08919622612772353, + "language_loss": 0.8301183, + "learning_rate": 0.0003449582702816976, + "loss": 0.84144139, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.15783691, + "step": 3183, + "time_per_iteration": 2.696509599685669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131366, + "balance_loss_mlp": 1.11577392, + "epoch": 0.6125432858791843, + "flos": 558056729088.0, + "grad_norm": 0.07246136408920362, + "language_loss": 0.82839715, + "learning_rate": 0.0003446621142134122, + "loss": 0.83971083, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.15576172, + "step": 3184, + "time_per_iteration": 2.6876282691955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129534, + "balance_loss_mlp": 1.1142869, + "epoch": 0.6127356675644479, + "flos": 415015944192.0, + "grad_norm": 0.10207734274681185, + "language_loss": 0.84166813, + "learning_rate": 0.0003443660184526424, + "loss": 0.85296345, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.15222168, + "step": 3185, + "time_per_iteration": 2.457191228866577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126874, + 
"balance_loss_mlp": 1.11150801, + "epoch": 0.6129280492497114, + "flos": 603843434496.0, + "grad_norm": 0.08690649590486366, + "language_loss": 0.86419243, + "learning_rate": 0.0003440699831143429, + "loss": 0.8754611, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.15356445, + "step": 3186, + "time_per_iteration": 2.7862656116485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117106, + "balance_loss_mlp": 1.10134614, + "epoch": 0.613120430934975, + "flos": 519766295040.0, + "grad_norm": 0.09433598630753232, + "language_loss": 0.82150078, + "learning_rate": 0.0003437740083134449, + "loss": 0.83267176, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.1574707, + "step": 3187, + "time_per_iteration": 2.732182502746582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110686, + "balance_loss_mlp": 1.09489119, + "epoch": 0.6133128126202385, + "flos": 511083965952.0, + "grad_norm": 0.107565485764287, + "language_loss": 0.83600903, + "learning_rate": 0.00034347809416485574, + "loss": 0.84711587, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.15783691, + "step": 3188, + "time_per_iteration": 2.5941028594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108913, + "balance_loss_mlp": 1.09327221, + "epoch": 0.6135051943055021, + "flos": 607562021376.0, + "grad_norm": 0.07306418964956934, + "language_loss": 0.81643283, + "learning_rate": 0.0003431822407834597, + "loss": 0.82752192, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.15625, + "step": 3189, + "time_per_iteration": 2.79345440864563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107509, + "balance_loss_mlp": 1.09151149, + "epoch": 0.6136975759907657, + "flos": 1160200931328.0, + "grad_norm": 0.07663580973151435, + "language_loss": 0.83989727, + "learning_rate": 0.00034288644828411706, + "loss": 0.85097235, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 
0.15991211, + "step": 3190, + "time_per_iteration": 3.495431423187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107683, + "balance_loss_mlp": 1.0914706, + "epoch": 0.6138899576760293, + "flos": 706938596352.0, + "grad_norm": 0.09805760174561111, + "language_loss": 0.75479543, + "learning_rate": 0.0003425907167816649, + "loss": 0.76587236, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.16210938, + "step": 3191, + "time_per_iteration": 2.890688896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100168, + "balance_loss_mlp": 1.0839076, + "epoch": 0.6140823393612928, + "flos": 586443898368.0, + "grad_norm": 0.08119558149243, + "language_loss": 0.84596795, + "learning_rate": 0.00034229504639091623, + "loss": 0.85696959, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.16259766, + "step": 3192, + "time_per_iteration": 2.799213171005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110546, + "balance_loss_mlp": 1.08940232, + "epoch": 0.6142747210465563, + "flos": 804130633728.0, + "grad_norm": 0.13197057459029027, + "language_loss": 0.79937923, + "learning_rate": 0.0003419994372266606, + "loss": 0.81043386, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.16052246, + "step": 3193, + "time_per_iteration": 3.1180262565612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103547, + "balance_loss_mlp": 1.08715582, + "epoch": 0.6144671027318199, + "flos": 529434620928.0, + "grad_norm": 0.07478792325095046, + "language_loss": 0.81555808, + "learning_rate": 0.00034170388940366335, + "loss": 0.82659352, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.16381836, + "step": 3194, + "time_per_iteration": 2.7108078002929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105556, + "balance_loss_mlp": 1.08935523, + "epoch": 0.6146594844170835, + "flos": 805425348096.0, + "grad_norm": 0.1666581336707107, + "language_loss": 
0.80146444, + "learning_rate": 0.0003414084030366667, + "loss": 0.81251997, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.16210938, + "step": 3195, + "time_per_iteration": 3.146375894546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098435, + "balance_loss_mlp": 1.08159113, + "epoch": 0.6148518661023471, + "flos": 501697193472.0, + "grad_norm": 0.07855669714866301, + "language_loss": 0.82993454, + "learning_rate": 0.0003411129782403883, + "loss": 0.8409189, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.1685791, + "step": 3196, + "time_per_iteration": 2.6907546520233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102656, + "balance_loss_mlp": 1.0864203, + "epoch": 0.6150442477876106, + "flos": 510688613376.0, + "grad_norm": 0.08662161159961286, + "language_loss": 0.84978783, + "learning_rate": 0.0003408176151295225, + "loss": 0.86081439, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.16235352, + "step": 3197, + "time_per_iteration": 2.7353785037994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098614, + "balance_loss_mlp": 1.08207965, + "epoch": 0.6152366294728742, + "flos": 527005979136.0, + "grad_norm": 0.11963983083590954, + "language_loss": 0.77372497, + "learning_rate": 0.00034052231381873944, + "loss": 0.78471112, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.16540527, + "step": 3198, + "time_per_iteration": 2.673388957977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097817, + "balance_loss_mlp": 1.08129418, + "epoch": 0.6154290111581378, + "flos": 473300112384.0, + "grad_norm": 0.07877091537638886, + "language_loss": 0.84876865, + "learning_rate": 0.00034022707442268494, + "loss": 0.85974681, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.1652832, + "step": 3199, + "time_per_iteration": 2.5626182556152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090925, + 
"balance_loss_mlp": 1.07454538, + "epoch": 0.6156213928434013, + "flos": 550819616256.0, + "grad_norm": 0.07568498479176501, + "language_loss": 0.81815386, + "learning_rate": 0.0003399318970559813, + "loss": 0.82906306, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.16381836, + "step": 3200, + "time_per_iteration": 2.829237461090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108966, + "balance_loss_mlp": 1.07353139, + "epoch": 0.6158137745286649, + "flos": 750941259264.0, + "grad_norm": 0.2497942099132976, + "language_loss": 0.8433665, + "learning_rate": 0.00033963678183322656, + "loss": 0.85426307, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.16125488, + "step": 3201, + "time_per_iteration": 3.1063387393951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087739, + "balance_loss_mlp": 1.07162154, + "epoch": 0.6160061562139284, + "flos": 555815665152.0, + "grad_norm": 0.06940460952874025, + "language_loss": 0.82539898, + "learning_rate": 0.0003393417288689945, + "loss": 0.83627635, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.16113281, + "step": 3202, + "time_per_iteration": 2.7065072059631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093588, + "balance_loss_mlp": 1.0775063, + "epoch": 0.616198537899192, + "flos": 742177437696.0, + "grad_norm": 0.08060008317875632, + "language_loss": 0.75810564, + "learning_rate": 0.00033904673827783504, + "loss": 0.76904154, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.1607666, + "step": 3203, + "time_per_iteration": 2.976076364517212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091051, + "balance_loss_mlp": 1.07505345, + "epoch": 0.6163909195844556, + "flos": 478810082304.0, + "grad_norm": 0.05609765928721304, + "language_loss": 0.81290334, + "learning_rate": 0.00033875181017427357, + "loss": 0.8238138, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 
0.15991211, + "step": 3204, + "time_per_iteration": 2.617102861404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094322, + "balance_loss_mlp": 1.0783596, + "epoch": 0.6165833012697192, + "flos": 531517469184.0, + "grad_norm": 0.06962026765049416, + "language_loss": 0.80802751, + "learning_rate": 0.00033845694467281133, + "loss": 0.81897068, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.1595459, + "step": 3205, + "time_per_iteration": 2.9406063556671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100233, + "balance_loss_mlp": 1.08492684, + "epoch": 0.6167756829549826, + "flos": 807765156864.0, + "grad_norm": 0.08157941962089017, + "language_loss": 0.83428419, + "learning_rate": 0.00033816214188792516, + "loss": 0.84528655, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.1529541, + "step": 3206, + "time_per_iteration": 3.1819798946380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097691, + "balance_loss_mlp": 1.08228946, + "epoch": 0.6169680646402462, + "flos": 488928089088.0, + "grad_norm": 0.0725317157216798, + "language_loss": 0.85080433, + "learning_rate": 0.00033786740193406784, + "loss": 0.86178124, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.15380859, + "step": 3207, + "time_per_iteration": 2.5949695110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099512, + "balance_loss_mlp": 1.08397925, + "epoch": 0.6171604463255098, + "flos": 618954918912.0, + "grad_norm": 0.09100196338205928, + "language_loss": 0.81269908, + "learning_rate": 0.00033757272492566736, + "loss": 0.82369423, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.15515137, + "step": 3208, + "time_per_iteration": 2.896113157272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101736, + "balance_loss_mlp": 1.08576202, + "epoch": 0.6173528280107734, + "flos": 528859031040.0, + "grad_norm": 0.061084762656912546, + 
"language_loss": 0.86857277, + "learning_rate": 0.0003372781109771278, + "loss": 0.87959015, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.15966797, + "step": 3209, + "time_per_iteration": 2.7744648456573486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098286, + "balance_loss_mlp": 1.08235943, + "epoch": 0.617545209696037, + "flos": 596581728768.0, + "grad_norm": 0.0666635733945454, + "language_loss": 0.7634722, + "learning_rate": 0.0003369835602028281, + "loss": 0.77445507, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.15917969, + "step": 3210, + "time_per_iteration": 2.807690143585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109414, + "balance_loss_mlp": 1.07845259, + "epoch": 0.6177375913813005, + "flos": 475098835968.0, + "grad_norm": 0.06505304980204422, + "language_loss": 0.79307866, + "learning_rate": 0.0003366890727171232, + "loss": 0.80402005, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.15673828, + "step": 3211, + "time_per_iteration": 2.6847074031829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092887, + "balance_loss_mlp": 1.07701993, + "epoch": 0.617929973066564, + "flos": 529812721152.0, + "grad_norm": 0.08815950120803863, + "language_loss": 0.78273273, + "learning_rate": 0.00033639464863434313, + "loss": 0.79366159, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.15856934, + "step": 3212, + "time_per_iteration": 2.6401009559631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_mlp": 1.0277102, + "epoch": 0.6181223547518276, + "flos": 1420053783552.0, + "grad_norm": 0.026269033760010364, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79478669, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.08496094, + "step": 3213, + "time_per_iteration": 4.715362787246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.0108628, + "balance_loss_mlp": 1.07037783, + "epoch": 0.6183147364370912, + "flos": 740319243264.0, + "grad_norm": 0.0738307593479646, + "language_loss": 0.79866982, + "learning_rate": 0.00033580599113475543, + "loss": 0.80953264, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.15893555, + "step": 3214, + "time_per_iteration": 3.000586986541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085789, + "balance_loss_mlp": 1.06937385, + "epoch": 0.6185071181223547, + "flos": 381649978368.0, + "grad_norm": 0.07082068470291375, + "language_loss": 0.86112303, + "learning_rate": 0.00033551175794648507, + "loss": 0.87198091, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.16418457, + "step": 3215, + "time_per_iteration": 2.494271755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090218, + "balance_loss_mlp": 1.0744828, + "epoch": 0.6186994998076183, + "flos": 463347661824.0, + "grad_norm": 0.12386747006326235, + "language_loss": 0.81595516, + "learning_rate": 0.00033521758861821365, + "loss": 0.82685733, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.15722656, + "step": 3216, + "time_per_iteration": 2.646888256072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084021, + "balance_loss_mlp": 1.06845176, + "epoch": 0.6188918814928819, + "flos": 485273742336.0, + "grad_norm": 0.07895450419788622, + "language_loss": 0.88963878, + "learning_rate": 0.0003349234832641479, + "loss": 0.90047896, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.15551758, + "step": 3217, + "time_per_iteration": 2.603308916091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082545, + "balance_loss_mlp": 1.06719124, + "epoch": 0.6190842631781455, + "flos": 657307021824.0, + "grad_norm": 0.07412246330535043, + "language_loss": 0.808752, + "learning_rate": 0.00033462944199846975, + "loss": 0.81957746, + "num_input_tokens_seen": 268478512, + 
"router_z_loss_mlp": 0.15332031, + "step": 3218, + "time_per_iteration": 3.086716413497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083407, + "balance_loss_mlp": 1.06795752, + "epoch": 0.619276644863409, + "flos": 403603223040.0, + "grad_norm": 0.07145505501141985, + "language_loss": 0.86298114, + "learning_rate": 0.00033433546493533606, + "loss": 0.87381524, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.15429688, + "step": 3219, + "time_per_iteration": 2.525264024734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079767, + "balance_loss_mlp": 1.06349516, + "epoch": 0.6194690265486725, + "flos": 583093499904.0, + "grad_norm": 0.086291171152169, + "language_loss": 0.83994114, + "learning_rate": 0.00033404155218887897, + "loss": 0.85073888, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.16271973, + "step": 3220, + "time_per_iteration": 2.7530763149261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080633, + "balance_loss_mlp": 1.06478977, + "epoch": 0.6196614082339361, + "flos": 504246974976.0, + "grad_norm": 0.11530682173053017, + "language_loss": 0.87328637, + "learning_rate": 0.00033374770387320534, + "loss": 0.88409269, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.1583252, + "step": 3221, + "time_per_iteration": 2.769804000854492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107728, + "balance_loss_mlp": 1.06110358, + "epoch": 0.6198537899191997, + "flos": 575409277440.0, + "grad_norm": 0.09653805931546991, + "language_loss": 0.84981918, + "learning_rate": 0.00033345392010239737, + "loss": 0.86059201, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.16174316, + "step": 3222, + "time_per_iteration": 2.742431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080173, + "balance_loss_mlp": 1.0643059, + "epoch": 0.6200461716044633, + "flos": 593157178368.0, + "grad_norm": 
0.08405780593634497, + "language_loss": 0.82221037, + "learning_rate": 0.0003331602009905118, + "loss": 0.8330121, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.15856934, + "step": 3223, + "time_per_iteration": 2.8276350498199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075434, + "balance_loss_mlp": 1.05924559, + "epoch": 0.6202385532897268, + "flos": 666093238272.0, + "grad_norm": 0.16424334065153295, + "language_loss": 0.83946419, + "learning_rate": 0.00033286654665158085, + "loss": 0.85021853, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.16186523, + "step": 3224, + "time_per_iteration": 3.0171141624450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074402, + "balance_loss_mlp": 1.05797529, + "epoch": 0.6204309349749904, + "flos": 484952541696.0, + "grad_norm": 0.07119512834175158, + "language_loss": 0.8751117, + "learning_rate": 0.0003325729571996109, + "loss": 0.88585573, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.16430664, + "step": 3225, + "time_per_iteration": 2.6336770057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107489, + "balance_loss_mlp": 1.05822468, + "epoch": 0.6206233166602539, + "flos": 584057101824.0, + "grad_norm": 0.07015160541541936, + "language_loss": 0.83497381, + "learning_rate": 0.000332279432748584, + "loss": 0.84572268, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.16674805, + "step": 3226, + "time_per_iteration": 2.8068268299102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077491, + "balance_loss_mlp": 1.06129014, + "epoch": 0.6208156983455175, + "flos": 476917383168.0, + "grad_norm": 0.08244551299177609, + "language_loss": 0.87847024, + "learning_rate": 0.00033198597341245576, + "loss": 0.88924515, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.1619873, + "step": 3227, + "time_per_iteration": 2.6014742851257324 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01070414, + "balance_loss_mlp": 1.05366468, + "epoch": 0.6210080800307811, + "flos": 789066137088.0, + "grad_norm": 0.25336628226947533, + "language_loss": 0.82029134, + "learning_rate": 0.00033169257930515763, + "loss": 0.83099544, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.16760254, + "step": 3228, + "time_per_iteration": 3.086378335952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080291, + "balance_loss_mlp": 1.06385183, + "epoch": 0.6212004617160446, + "flos": 607794388992.0, + "grad_norm": 0.06847993393240591, + "language_loss": 0.81926602, + "learning_rate": 0.0003313992505405951, + "loss": 0.83006895, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.16442871, + "step": 3229, + "time_per_iteration": 2.721404552459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108161, + "balance_loss_mlp": 1.06523085, + "epoch": 0.6213928434013082, + "flos": 586520621568.0, + "grad_norm": 0.08774924487902723, + "language_loss": 0.81243527, + "learning_rate": 0.0003311059872326487, + "loss": 0.82325131, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.16381836, + "step": 3230, + "time_per_iteration": 2.698370933532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108911, + "balance_loss_mlp": 1.07283747, + "epoch": 0.6215852250865718, + "flos": 536076320256.0, + "grad_norm": 0.06270851897860089, + "language_loss": 0.79239869, + "learning_rate": 0.0003308127894951734, + "loss": 0.80328983, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.16271973, + "step": 3231, + "time_per_iteration": 2.642587900161743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103422, + "balance_loss_mlp": 1.08775783, + "epoch": 0.6217776067718354, + "flos": 618169356288.0, + "grad_norm": 0.08661735945453952, + "language_loss": 0.86286879, + "learning_rate": 0.00033051965744199834, + "loss": 0.87390304, + "num_input_tokens_seen": 
269498784, + "router_z_loss_mlp": 0.15649414, + "step": 3232, + "time_per_iteration": 2.7654480934143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110456, + "balance_loss_mlp": 1.08876467, + "epoch": 0.6219699884570988, + "flos": 545875324416.0, + "grad_norm": 0.08070984322149112, + "language_loss": 0.90182227, + "learning_rate": 0.0003302265911869276, + "loss": 0.91286784, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.15795898, + "step": 3233, + "time_per_iteration": 2.973137378692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102412, + "balance_loss_mlp": 1.0863899, + "epoch": 0.6221623701423624, + "flos": 481149891072.0, + "grad_norm": 0.10903375315804033, + "language_loss": 0.83981085, + "learning_rate": 0.0003299335908437397, + "loss": 0.85083497, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.16015625, + "step": 3234, + "time_per_iteration": 2.6683669090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110257, + "balance_loss_mlp": 1.08685815, + "epoch": 0.622354751827626, + "flos": 380024151552.0, + "grad_norm": 0.08931018897921299, + "language_loss": 0.79380894, + "learning_rate": 0.0003296406565261873, + "loss": 0.8048346, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.15698242, + "step": 3235, + "time_per_iteration": 2.4825046062469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093994, + "balance_loss_mlp": 1.07830596, + "epoch": 0.6225471335128896, + "flos": 667869940224.0, + "grad_norm": 0.08356203677031868, + "language_loss": 0.84839869, + "learning_rate": 0.0003293477883479978, + "loss": 0.85933864, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.15673828, + "step": 3236, + "time_per_iteration": 2.855417013168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096114, + "balance_loss_mlp": 1.08046174, + "epoch": 0.6227395151981532, + "flos": 771320807424.0, + "grad_norm": 
0.0752906084942527, + "language_loss": 0.79873055, + "learning_rate": 0.0003290549864228727, + "loss": 0.80969167, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.15637207, + "step": 3237, + "time_per_iteration": 2.954319953918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094914, + "balance_loss_mlp": 1.07898724, + "epoch": 0.6229318968834167, + "flos": 484354556928.0, + "grad_norm": 0.0798274919474459, + "language_loss": 0.86145848, + "learning_rate": 0.0003287622508644875, + "loss": 0.87240762, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.15917969, + "step": 3238, + "time_per_iteration": 2.7834508419036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095092, + "balance_loss_mlp": 1.07920146, + "epoch": 0.6231242785686802, + "flos": 462935056896.0, + "grad_norm": 0.08228635643627878, + "language_loss": 0.86427939, + "learning_rate": 0.0003284695817864923, + "loss": 0.87523031, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.15881348, + "step": 3239, + "time_per_iteration": 2.52299427986145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089491, + "balance_loss_mlp": 1.07388628, + "epoch": 0.6233166602539438, + "flos": 609089103360.0, + "grad_norm": 0.07912840789320032, + "language_loss": 0.83886796, + "learning_rate": 0.0003281769793025116, + "loss": 0.84976286, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.15588379, + "step": 3240, + "time_per_iteration": 2.736513614654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090298, + "balance_loss_mlp": 1.07525432, + "epoch": 0.6235090419392074, + "flos": 439200340992.0, + "grad_norm": 0.08036892690919402, + "language_loss": 0.89556086, + "learning_rate": 0.00032788444352614346, + "loss": 0.90646392, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.15014648, + "step": 3241, + "time_per_iteration": 2.532486915588379 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01091662, + "balance_loss_mlp": 1.07645059, + "epoch": 0.6237014236244709, + "flos": 504904430592.0, + "grad_norm": 0.10748346186941515, + "language_loss": 0.80754519, + "learning_rate": 0.0003275919745709606, + "loss": 0.81846178, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.15197754, + "step": 3242, + "time_per_iteration": 2.6164467334747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093762, + "balance_loss_mlp": 1.07853913, + "epoch": 0.6238938053097345, + "flos": 512917194240.0, + "grad_norm": 0.07410139780614007, + "language_loss": 0.82327247, + "learning_rate": 0.00032729957255050936, + "loss": 0.83421004, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.15197754, + "step": 3243, + "time_per_iteration": 2.711912155151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094763, + "balance_loss_mlp": 1.07895613, + "epoch": 0.6240861869949981, + "flos": 736751531520.0, + "grad_norm": 0.07913543428232035, + "language_loss": 0.81355995, + "learning_rate": 0.0003270072375783102, + "loss": 0.82450759, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.15795898, + "step": 3244, + "time_per_iteration": 2.896878242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091166, + "balance_loss_mlp": 1.07548952, + "epoch": 0.6242785686802617, + "flos": 494712271872.0, + "grad_norm": 0.10691714389102631, + "language_loss": 0.79955053, + "learning_rate": 0.00032671496976785774, + "loss": 0.81046224, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.15661621, + "step": 3245, + "time_per_iteration": 2.6352155208587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089637, + "balance_loss_mlp": 1.07429433, + "epoch": 0.6244709503655252, + "flos": 745846465536.0, + "grad_norm": 0.06870861562769151, + "language_loss": 0.75493729, + "learning_rate": 0.0003264227692326205, + "loss": 0.76583362, + "num_input_tokens_seen": 
270501680, + "router_z_loss_mlp": 0.15319824, + "step": 3246, + "time_per_iteration": 3.093111991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098568, + "balance_loss_mlp": 1.08292735, + "epoch": 0.6246633320507887, + "flos": 492602259456.0, + "grad_norm": 0.06424326039406808, + "language_loss": 0.85849744, + "learning_rate": 0.00032613063608604055, + "loss": 0.86948311, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.15625, + "step": 3247, + "time_per_iteration": 2.5499489307403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109153, + "balance_loss_mlp": 1.07599711, + "epoch": 0.6248557137360523, + "flos": 517391981568.0, + "grad_norm": 0.07629898718313471, + "language_loss": 0.83584791, + "learning_rate": 0.0003258385704415343, + "loss": 0.84676319, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.15515137, + "step": 3248, + "time_per_iteration": 2.6027162075042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098413, + "balance_loss_mlp": 1.08287978, + "epoch": 0.6250480954213159, + "flos": 519363601920.0, + "grad_norm": 0.08365862742240879, + "language_loss": 0.83149463, + "learning_rate": 0.0003255465724124915, + "loss": 0.84247875, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.15515137, + "step": 3249, + "time_per_iteration": 2.730041742324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104065, + "balance_loss_mlp": 1.08905637, + "epoch": 0.6252404771065795, + "flos": 516060191232.0, + "grad_norm": 0.06996210477337128, + "language_loss": 0.82732821, + "learning_rate": 0.00032525464211227587, + "loss": 0.83836889, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.14990234, + "step": 3250, + "time_per_iteration": 2.610226631164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103584, + "balance_loss_mlp": 1.08822954, + "epoch": 0.6254328587918431, + "flos": 576916535808.0, + "grad_norm": 
0.07802302552021714, + "language_loss": 0.85721552, + "learning_rate": 0.0003249627796542249, + "loss": 0.86825138, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.15344238, + "step": 3251, + "time_per_iteration": 2.6803338527679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096481, + "balance_loss_mlp": 1.08087671, + "epoch": 0.6256252404771065, + "flos": 597930771456.0, + "grad_norm": 0.06796886597931054, + "language_loss": 0.84280014, + "learning_rate": 0.00032467098515164943, + "loss": 0.85376501, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.15588379, + "step": 3252, + "time_per_iteration": 2.904672861099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111129, + "balance_loss_mlp": 1.0956316, + "epoch": 0.6258176221623701, + "flos": 508299245568.0, + "grad_norm": 0.09344441617703737, + "language_loss": 0.84051675, + "learning_rate": 0.00032437925871783456, + "loss": 0.85162807, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.15490723, + "step": 3253, + "time_per_iteration": 2.704474925994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101193, + "balance_loss_mlp": 1.08483791, + "epoch": 0.6260100038476337, + "flos": 639645755904.0, + "grad_norm": 0.07749015001842677, + "language_loss": 0.84249985, + "learning_rate": 0.00032408760046603803, + "loss": 0.85351181, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.16357422, + "step": 3254, + "time_per_iteration": 2.849126100540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103151, + "balance_loss_mlp": 1.08711767, + "epoch": 0.6262023855328973, + "flos": 841007784960.0, + "grad_norm": 0.06356173673048542, + "language_loss": 0.77591729, + "learning_rate": 0.00032379601050949193, + "loss": 0.7869488, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.16027832, + "step": 3255, + "time_per_iteration": 3.119446039199829 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01091759, + "balance_loss_mlp": 1.07567763, + "epoch": 0.6263947672181608, + "flos": 522138410496.0, + "grad_norm": 0.07099798936628814, + "language_loss": 0.88052809, + "learning_rate": 0.0003235044889614013, + "loss": 0.8914457, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.1607666, + "step": 3256, + "time_per_iteration": 2.613060235977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094317, + "balance_loss_mlp": 1.07879567, + "epoch": 0.6265871489034244, + "flos": 607055440896.0, + "grad_norm": 0.09103285060776488, + "language_loss": 0.8368516, + "learning_rate": 0.0003232130359349451, + "loss": 0.84779477, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.1550293, + "step": 3257, + "time_per_iteration": 2.8671774864196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089032, + "balance_loss_mlp": 1.07287872, + "epoch": 0.626779530588688, + "flos": 588484901376.0, + "grad_norm": 0.0836607688375681, + "language_loss": 0.81645948, + "learning_rate": 0.0003229216515432751, + "loss": 0.82734984, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.16149902, + "step": 3258, + "time_per_iteration": 2.8217055797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093741, + "balance_loss_mlp": 1.0781126, + "epoch": 0.6269719122739515, + "flos": 438612268032.0, + "grad_norm": 0.07437080519931394, + "language_loss": 0.79591352, + "learning_rate": 0.0003226303358995174, + "loss": 0.80685091, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.15612793, + "step": 3259, + "time_per_iteration": 2.613922595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109242, + "balance_loss_mlp": 1.07588553, + "epoch": 0.6271642939592151, + "flos": 562874738688.0, + "grad_norm": 0.06263163093589014, + "language_loss": 0.88819879, + "learning_rate": 0.00032233908911677, + "loss": 0.89912301, + "num_input_tokens_seen": 
271526768, + "router_z_loss_mlp": 0.16540527, + "step": 3260, + "time_per_iteration": 2.855600118637085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109092, + "balance_loss_mlp": 1.07450485, + "epoch": 0.6273566756444786, + "flos": 514560273408.0, + "grad_norm": 0.06460016363514721, + "language_loss": 0.80802065, + "learning_rate": 0.0003220479113081053, + "loss": 0.81892991, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.16418457, + "step": 3261, + "time_per_iteration": 2.753509759902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085244, + "balance_loss_mlp": 1.06910312, + "epoch": 0.6275490573297422, + "flos": 585472955904.0, + "grad_norm": 0.074937478592973, + "language_loss": 0.79032731, + "learning_rate": 0.00032175680258656836, + "loss": 0.80117977, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.16137695, + "step": 3262, + "time_per_iteration": 2.7336065769195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085552, + "balance_loss_mlp": 1.06954229, + "epoch": 0.6277414390150058, + "flos": 559423024128.0, + "grad_norm": 0.06015193391132931, + "language_loss": 0.79762304, + "learning_rate": 0.00032146576306517794, + "loss": 0.80847853, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.16003418, + "step": 3263, + "time_per_iteration": 2.8162710666656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087014, + "balance_loss_mlp": 1.070611, + "epoch": 0.6279338207002694, + "flos": 612706374144.0, + "grad_norm": 0.08732390262483163, + "language_loss": 0.80907923, + "learning_rate": 0.0003211747928569255, + "loss": 0.81994939, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.1640625, + "step": 3264, + "time_per_iteration": 2.7805709838867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087684, + "balance_loss_mlp": 1.07150757, + "epoch": 0.6281262023855329, + "flos": 625685451264.0, + "grad_norm": 
0.06366142393715324, + "language_loss": 0.81574047, + "learning_rate": 0.0003208838920747754, + "loss": 0.82661736, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.16174316, + "step": 3265, + "time_per_iteration": 2.8634932041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087654, + "balance_loss_mlp": 1.07176387, + "epoch": 0.6283185840707964, + "flos": 1123600564224.0, + "grad_norm": 0.06892871755232625, + "language_loss": 0.76471019, + "learning_rate": 0.0003205930608316656, + "loss": 0.77558672, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.15881348, + "step": 3266, + "time_per_iteration": 3.491633176803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088752, + "balance_loss_mlp": 1.07274199, + "epoch": 0.62851096575606, + "flos": 515239750656.0, + "grad_norm": 0.07065676872193134, + "language_loss": 0.84763551, + "learning_rate": 0.00032030229924050673, + "loss": 0.85852307, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.16003418, + "step": 3267, + "time_per_iteration": 2.7322630882263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081627, + "balance_loss_mlp": 1.0655694, + "epoch": 0.6287033474413236, + "flos": 404171472384.0, + "grad_norm": 0.076810738762244, + "language_loss": 0.80159783, + "learning_rate": 0.00032001160741418247, + "loss": 0.81241405, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.16052246, + "step": 3268, + "time_per_iteration": 2.683931589126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083715, + "balance_loss_mlp": 1.06859946, + "epoch": 0.6288957291265872, + "flos": 525718605312.0, + "grad_norm": 0.07050633409019491, + "language_loss": 0.81839114, + "learning_rate": 0.0003197209854655494, + "loss": 0.82922828, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.15100098, + "step": 3269, + "time_per_iteration": 2.7007665634155273 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01088437, + "balance_loss_mlp": 1.0728085, + "epoch": 0.6290881108118507, + "flos": 603722294784.0, + "grad_norm": 0.07859150018843152, + "language_loss": 0.74576277, + "learning_rate": 0.0003194304335074371, + "loss": 0.75664711, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.15625, + "step": 3270, + "time_per_iteration": 2.8443710803985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093344, + "balance_loss_mlp": 1.07737029, + "epoch": 0.6292804924971143, + "flos": 437675830272.0, + "grad_norm": 0.07641817393063347, + "language_loss": 0.88118923, + "learning_rate": 0.0003191399516526475, + "loss": 0.89212275, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.15966797, + "step": 3271, + "time_per_iteration": 2.510565996170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109192, + "balance_loss_mlp": 1.07666111, + "epoch": 0.6294728741823779, + "flos": 606662659584.0, + "grad_norm": 0.06496379597485666, + "language_loss": 0.79376519, + "learning_rate": 0.0003188495400139559, + "loss": 0.8046844, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.15234375, + "step": 3272, + "time_per_iteration": 2.8364667892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095867, + "balance_loss_mlp": 1.0803932, + "epoch": 0.6296652558676414, + "flos": 701529942528.0, + "grad_norm": 0.07122529047297946, + "language_loss": 0.8439455, + "learning_rate": 0.00031855919870411013, + "loss": 0.85490417, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.15466309, + "step": 3273, + "time_per_iteration": 2.8570995330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086291, + "balance_loss_mlp": 1.0712353, + "epoch": 0.6298576375529049, + "flos": 523909969920.0, + "grad_norm": 0.06914500829494513, + "language_loss": 0.84985608, + "learning_rate": 0.0003182689278358305, + "loss": 0.86071897, + "num_input_tokens_seen": 
272562992, + "router_z_loss_mlp": 0.15039062, + "step": 3274, + "time_per_iteration": 2.757631301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108922, + "balance_loss_mlp": 1.07361603, + "epoch": 0.6300500192381685, + "flos": 475963693056.0, + "grad_norm": 0.07954775406848916, + "language_loss": 0.79536891, + "learning_rate": 0.0003179787275218105, + "loss": 0.80626118, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.15588379, + "step": 3275, + "time_per_iteration": 2.562164545059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083819, + "balance_loss_mlp": 1.06884634, + "epoch": 0.6302424009234321, + "flos": 520880772096.0, + "grad_norm": 0.08328401336331384, + "language_loss": 0.84322137, + "learning_rate": 0.0003176885978747155, + "loss": 0.85405958, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.14953613, + "step": 3276, + "time_per_iteration": 2.6230828762054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085747, + "balance_loss_mlp": 1.07017803, + "epoch": 0.6304347826086957, + "flos": 694596777984.0, + "grad_norm": 0.1699824723402015, + "language_loss": 0.82447994, + "learning_rate": 0.0003173985390071839, + "loss": 0.8353374, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.15551758, + "step": 3277, + "time_per_iteration": 2.913857936859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011052, + "balance_loss_mlp": 1.00342274, + "epoch": 0.6306271642939593, + "flos": 1466858045952.0, + "grad_norm": 0.01180096248497286, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78911507, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.07617188, + "step": 3278, + "time_per_iteration": 4.810575008392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095394, + "balance_loss_mlp": 1.07975388, + "epoch": 0.6308195459792227, + "flos": 601740762624.0, + "grad_norm": 
0.07584058368204265, + "language_loss": 0.81100649, + "learning_rate": 0.00031681863406122704, + "loss": 0.82196045, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.15625, + "step": 3279, + "time_per_iteration": 2.8176543712615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094938, + "balance_loss_mlp": 1.07984567, + "epoch": 0.6310119276644863, + "flos": 726858178560.0, + "grad_norm": 0.07145164235931235, + "language_loss": 0.85147798, + "learning_rate": 0.00031652878820794087, + "loss": 0.86242729, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.15063477, + "step": 3280, + "time_per_iteration": 3.010453462600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099528, + "balance_loss_mlp": 1.08434081, + "epoch": 0.6312043093497499, + "flos": 519749042688.0, + "grad_norm": 0.08537377503877883, + "language_loss": 0.85849619, + "learning_rate": 0.00031623901358449627, + "loss": 0.86949146, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.15161133, + "step": 3281, + "time_per_iteration": 2.6708781719207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101817, + "balance_loss_mlp": 1.08709431, + "epoch": 0.6313966910350135, + "flos": 531191499264.0, + "grad_norm": 0.05886068654642298, + "language_loss": 0.88589537, + "learning_rate": 0.0003159493103033936, + "loss": 0.89691359, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.14709473, + "step": 3282, + "time_per_iteration": 2.636570930480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023039, + "balance_loss_mlp": 1.01540971, + "epoch": 0.631589072720277, + "flos": 1379887529472.0, + "grad_norm": 0.014741970221396734, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80942094, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.07617188, + "step": 3283, + "time_per_iteration": 4.921837568283081 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01098019, + "balance_loss_mlp": 1.08298671, + "epoch": 0.6317814544055406, + "flos": 624677432832.0, + "grad_norm": 0.06611749936023467, + "language_loss": 0.82335258, + "learning_rate": 0.0003153701182180776, + "loss": 0.83433276, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.15014648, + "step": 3284, + "time_per_iteration": 2.804680824279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100744, + "balance_loss_mlp": 1.08583045, + "epoch": 0.6319738360908042, + "flos": 498119569920.0, + "grad_norm": 0.09468051023791588, + "language_loss": 0.81480467, + "learning_rate": 0.00031508062963872655, + "loss": 0.8258121, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.14892578, + "step": 3285, + "time_per_iteration": 2.618572950363159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104547, + "balance_loss_mlp": 1.08974171, + "epoch": 0.6321662177760677, + "flos": 579760353792.0, + "grad_norm": 0.07285431421686336, + "language_loss": 0.79529119, + "learning_rate": 0.0003147912128514423, + "loss": 0.80633664, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.14794922, + "step": 3286, + "time_per_iteration": 2.7349414825439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112049, + "balance_loss_mlp": 1.0971601, + "epoch": 0.6323585994613313, + "flos": 601486373376.0, + "grad_norm": 0.07001944194285717, + "language_loss": 0.87457585, + "learning_rate": 0.0003145018679685859, + "loss": 0.88569629, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.14868164, + "step": 3287, + "time_per_iteration": 2.735057830810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106918, + "balance_loss_mlp": 1.09238625, + "epoch": 0.6325509811465948, + "flos": 528535259136.0, + "grad_norm": 0.06287056538994153, + "language_loss": 0.87662357, + "learning_rate": 0.00031421259510249134, + "loss": 0.88769281, + 
"num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.1451416, + "step": 3288, + "time_per_iteration": 2.7864692211151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112538, + "balance_loss_mlp": 1.09816122, + "epoch": 0.6327433628318584, + "flos": 574262866944.0, + "grad_norm": 0.07989548298416052, + "language_loss": 0.80931014, + "learning_rate": 0.00031392339436546414, + "loss": 0.82043552, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.14355469, + "step": 3289, + "time_per_iteration": 2.8174936771392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110895, + "balance_loss_mlp": 1.09549332, + "epoch": 0.632935744517122, + "flos": 517088033280.0, + "grad_norm": 0.0967935034115468, + "language_loss": 0.83535063, + "learning_rate": 0.00031363426586978205, + "loss": 0.84645951, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.15380859, + "step": 3290, + "time_per_iteration": 2.7781615257263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106481, + "balance_loss_mlp": 1.09155595, + "epoch": 0.6331281262023856, + "flos": 617462714880.0, + "grad_norm": 0.07036168037167431, + "language_loss": 0.84420347, + "learning_rate": 0.0003133452097276947, + "loss": 0.8552683, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.14904785, + "step": 3291, + "time_per_iteration": 2.7578635215759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098733, + "balance_loss_mlp": 1.08364153, + "epoch": 0.633320507887649, + "flos": 592954546176.0, + "grad_norm": 0.07346038815510673, + "language_loss": 0.84298337, + "learning_rate": 0.0003130562260514238, + "loss": 0.85397065, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.15075684, + "step": 3292, + "time_per_iteration": 2.798175096511841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092883, + "balance_loss_mlp": 1.07720733, + "epoch": 0.6335128895729126, + "flos": 
582349782528.0, + "grad_norm": 0.07455275976827726, + "language_loss": 0.81438339, + "learning_rate": 0.0003127673149531626, + "loss": 0.8253122, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.15661621, + "step": 3293, + "time_per_iteration": 2.7655112743377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086095, + "balance_loss_mlp": 1.07095516, + "epoch": 0.6337052712581762, + "flos": 453036934656.0, + "grad_norm": 0.083592197063536, + "language_loss": 0.83216, + "learning_rate": 0.0003124784765450762, + "loss": 0.84302098, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.15124512, + "step": 3294, + "time_per_iteration": 2.5880134105682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092147, + "balance_loss_mlp": 1.07686436, + "epoch": 0.6338976529434398, + "flos": 573407921664.0, + "grad_norm": 0.09213521836591561, + "language_loss": 0.79931903, + "learning_rate": 0.0003121897109393017, + "loss": 0.81024045, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.15283203, + "step": 3295, + "time_per_iteration": 2.7655093669891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086202, + "balance_loss_mlp": 1.07047844, + "epoch": 0.6340900346287034, + "flos": 508758838272.0, + "grad_norm": 0.06242699112369121, + "language_loss": 0.88973814, + "learning_rate": 0.0003119010182479481, + "loss": 0.90060019, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.15710449, + "step": 3296, + "time_per_iteration": 2.631047010421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086362, + "balance_loss_mlp": 1.07093644, + "epoch": 0.6342824163139669, + "flos": 479746520064.0, + "grad_norm": 0.06994096564397366, + "language_loss": 0.82599872, + "learning_rate": 0.00031161239858309563, + "loss": 0.83686233, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.15405273, + "step": 3297, + "time_per_iteration": 2.599755048751831 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086667, + "balance_loss_mlp": 1.07108665, + "epoch": 0.6344747979992305, + "flos": 572031714816.0, + "grad_norm": 0.09286327126840728, + "language_loss": 0.8328709, + "learning_rate": 0.0003113238520567964, + "loss": 0.8437376, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.15563965, + "step": 3298, + "time_per_iteration": 2.728113889694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088611, + "balance_loss_mlp": 1.07316184, + "epoch": 0.634667179684494, + "flos": 605911601664.0, + "grad_norm": 0.09050699432092259, + "language_loss": 0.81456614, + "learning_rate": 0.00031103537878107403, + "loss": 0.82545221, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.15441895, + "step": 3299, + "time_per_iteration": 2.746675729751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091033, + "balance_loss_mlp": 1.07576215, + "epoch": 0.6348595613697576, + "flos": 646944537600.0, + "grad_norm": 0.08418360382923895, + "language_loss": 0.7968322, + "learning_rate": 0.0003107469788679238, + "loss": 0.8077426, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.15246582, + "step": 3300, + "time_per_iteration": 2.7789735794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086389, + "balance_loss_mlp": 1.07030737, + "epoch": 0.6350519430550212, + "flos": 639074935296.0, + "grad_norm": 0.07428233457329445, + "language_loss": 0.86447507, + "learning_rate": 0.00031045865242931267, + "loss": 0.87533897, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.1607666, + "step": 3301, + "time_per_iteration": 2.8069655895233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096867, + "balance_loss_mlp": 1.08088112, + "epoch": 0.6352443247402847, + "flos": 686437908480.0, + "grad_norm": 0.07374364047073086, + "language_loss": 0.83124268, + "learning_rate": 0.00031017039957717877, + "loss": 0.84221137, + 
"num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.15979004, + "step": 3302, + "time_per_iteration": 3.0203216075897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109177, + "balance_loss_mlp": 1.07607031, + "epoch": 0.6354367064255483, + "flos": 559442847744.0, + "grad_norm": 0.08011037824004849, + "language_loss": 0.88448334, + "learning_rate": 0.0003098822204234318, + "loss": 0.895401, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.15686035, + "step": 3303, + "time_per_iteration": 2.722560405731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086918, + "balance_loss_mlp": 1.07146788, + "epoch": 0.6356290881108119, + "flos": 979487520768.0, + "grad_norm": 0.14532397692109592, + "language_loss": 0.87361807, + "learning_rate": 0.00030959411507995273, + "loss": 0.88448727, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.15429688, + "step": 3304, + "time_per_iteration": 3.2270877361297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109049, + "balance_loss_mlp": 1.07495642, + "epoch": 0.6358214697960755, + "flos": 528278298624.0, + "grad_norm": 0.07985404208202107, + "language_loss": 0.80787814, + "learning_rate": 0.00030930608365859407, + "loss": 0.8187831, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.15515137, + "step": 3305, + "time_per_iteration": 2.7090413570404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.07174611, + "epoch": 0.6360138514813389, + "flos": 516811249152.0, + "grad_norm": 0.731689338993936, + "language_loss": 0.87885678, + "learning_rate": 0.00030901812627117943, + "loss": 0.88973522, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.16088867, + "step": 3306, + "time_per_iteration": 2.6327977180480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077096, + "balance_loss_mlp": 1.06109858, + "epoch": 0.6362062331666025, + "flos": 
466525163520.0, + "grad_norm": 0.09002939621512045, + "language_loss": 0.84808385, + "learning_rate": 0.000308730243029504, + "loss": 0.85885489, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.15979004, + "step": 3307, + "time_per_iteration": 2.6054556369781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088589, + "balance_loss_mlp": 1.07207811, + "epoch": 0.6363986148518661, + "flos": 549720193536.0, + "grad_norm": 0.0753497997145879, + "language_loss": 0.79653525, + "learning_rate": 0.0003084424340453339, + "loss": 0.80742109, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.16516113, + "step": 3308, + "time_per_iteration": 2.8042142391204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095017, + "balance_loss_mlp": 1.0775888, + "epoch": 0.6365909965371297, + "flos": 583049083392.0, + "grad_norm": 0.08328342026231418, + "language_loss": 0.82059419, + "learning_rate": 0.0003081546994304064, + "loss": 0.8315444, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.17443848, + "step": 3309, + "time_per_iteration": 2.7940802574157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100113, + "balance_loss_mlp": 1.08294737, + "epoch": 0.6367833782223933, + "flos": 531255739392.0, + "grad_norm": 0.07711723091328526, + "language_loss": 0.81634271, + "learning_rate": 0.0003078670392964298, + "loss": 0.82734382, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.17175293, + "step": 3310, + "time_per_iteration": 2.6288981437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111543, + "balance_loss_mlp": 1.09684515, + "epoch": 0.6369757599076568, + "flos": 569506526208.0, + "grad_norm": 0.09648821040849707, + "language_loss": 0.83039993, + "learning_rate": 0.00030757945375508406, + "loss": 0.84155422, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.18591309, + "step": 3311, + "time_per_iteration": 2.680053472518921 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120459, + "balance_loss_mlp": 1.10194564, + "epoch": 0.6371681415929203, + "flos": 539957892096.0, + "grad_norm": 0.07648325408881881, + "language_loss": 0.81110901, + "learning_rate": 0.00030729194291801944, + "loss": 0.82231361, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.18518066, + "step": 3312, + "time_per_iteration": 2.7345173358917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124831, + "balance_loss_mlp": 1.10598445, + "epoch": 0.6373605232781839, + "flos": 483566423040.0, + "grad_norm": 0.1187576427749129, + "language_loss": 0.76967251, + "learning_rate": 0.00030700450689685787, + "loss": 0.78092086, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.18847656, + "step": 3313, + "time_per_iteration": 2.5925910472869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134943, + "balance_loss_mlp": 1.11620378, + "epoch": 0.6375529049634475, + "flos": 578581636608.0, + "grad_norm": 0.086714433395562, + "language_loss": 0.85812229, + "learning_rate": 0.00030671714580319186, + "loss": 0.86947167, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.18762207, + "step": 3314, + "time_per_iteration": 2.8684160709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128682, + "balance_loss_mlp": 1.10954893, + "epoch": 0.637745286648711, + "flos": 682257530880.0, + "grad_norm": 0.07885995957457764, + "language_loss": 0.83140874, + "learning_rate": 0.0003064298597485846, + "loss": 0.84269553, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.19116211, + "step": 3315, + "time_per_iteration": 2.8987390995025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122628, + "balance_loss_mlp": 1.10333991, + "epoch": 0.6379376683339746, + "flos": 504637558272.0, + "grad_norm": 0.08106722698037498, + "language_loss": 0.84028, + "learning_rate": 0.00030614264884457054, + "loss": 
0.85150629, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.19274902, + "step": 3316, + "time_per_iteration": 2.671858787536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112572, + "balance_loss_mlp": 1.09383273, + "epoch": 0.6381300500192382, + "flos": 502020965376.0, + "grad_norm": 0.09520385776828669, + "language_loss": 0.77556765, + "learning_rate": 0.000305855513202655, + "loss": 0.78669333, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.18725586, + "step": 3317, + "time_per_iteration": 2.6103365421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105702, + "balance_loss_mlp": 1.08714104, + "epoch": 0.6383224317045018, + "flos": 400489961472.0, + "grad_norm": 0.0870793394439323, + "language_loss": 0.77407163, + "learning_rate": 0.0003055684529343138, + "loss": 0.78512859, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.18566895, + "step": 3318, + "time_per_iteration": 2.4441628456115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104453, + "balance_loss_mlp": 1.08614254, + "epoch": 0.6385148133897653, + "flos": 499377208320.0, + "grad_norm": 0.09431837628284816, + "language_loss": 0.78623343, + "learning_rate": 0.00030528146815099374, + "loss": 0.79727793, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.1829834, + "step": 3319, + "time_per_iteration": 2.6380391120910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092806, + "balance_loss_mlp": 1.07459044, + "epoch": 0.6387071950750288, + "flos": 527665632768.0, + "grad_norm": 0.0775286688862043, + "language_loss": 0.7192508, + "learning_rate": 0.00030499455896411203, + "loss": 0.73017889, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.18225098, + "step": 3320, + "time_per_iteration": 2.6337239742279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146548, + "balance_loss_mlp": 1.13748848, + "epoch": 0.6388995767602924, + 
"flos": 1455979069440.0, + "grad_norm": 0.05026445046140725, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77447361, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.09082031, + "step": 3321, + "time_per_iteration": 4.989959239959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080858, + "balance_loss_mlp": 1.06314373, + "epoch": 0.639091958445556, + "flos": 603895191552.0, + "grad_norm": 0.29371403446084504, + "language_loss": 0.76736987, + "learning_rate": 0.0003044209678251865, + "loss": 0.77817845, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.17712402, + "step": 3322, + "time_per_iteration": 2.9107608795166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075524, + "balance_loss_mlp": 1.05879879, + "epoch": 0.6392843401308196, + "flos": 584516694528.0, + "grad_norm": 0.07557324535671889, + "language_loss": 0.84569478, + "learning_rate": 0.0003041342860958306, + "loss": 0.85645002, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.1673584, + "step": 3323, + "time_per_iteration": 2.7665860652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010742, + "balance_loss_mlp": 1.0572598, + "epoch": 0.6394767218160831, + "flos": 514681413120.0, + "grad_norm": 0.11260284844343603, + "language_loss": 0.9165262, + "learning_rate": 0.00030384768040828857, + "loss": 0.92726815, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.16931152, + "step": 3324, + "time_per_iteration": 2.6840200424194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075186, + "balance_loss_mlp": 1.05894923, + "epoch": 0.6396691035013466, + "flos": 541732022784.0, + "grad_norm": 0.08385815306502278, + "language_loss": 0.85726339, + "learning_rate": 0.00030356115087383094, + "loss": 0.86801529, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.16235352, + "step": 3325, + "time_per_iteration": 2.685962200164795 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071194, + "balance_loss_mlp": 1.0543381, + "epoch": 0.6398614851866102, + "flos": 525535796736.0, + "grad_norm": 0.07882318349260847, + "language_loss": 0.85086048, + "learning_rate": 0.00030327469760369803, + "loss": 0.86157244, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.16870117, + "step": 3326, + "time_per_iteration": 2.5948264598846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075003, + "balance_loss_mlp": 1.05855227, + "epoch": 0.6400538668718738, + "flos": 622989937152.0, + "grad_norm": 0.09362500195471922, + "language_loss": 0.84774464, + "learning_rate": 0.0003029883207091009, + "loss": 0.8584947, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.16455078, + "step": 3327, + "time_per_iteration": 2.7647178173065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080953, + "balance_loss_mlp": 1.06489587, + "epoch": 0.6402462485571374, + "flos": 503367436800.0, + "grad_norm": 0.0837002807607971, + "language_loss": 0.7833994, + "learning_rate": 0.00030270202030122095, + "loss": 0.794209, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.16052246, + "step": 3328, + "time_per_iteration": 2.6863620281219482 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 276947200, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7536755586367488.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/training_args.bin b/sft_pretrain/Full_smoe/checkpoint-3328/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..653f1069449711a96532c47aa7e98309fc667b64 
--- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:095a9ba23d3697135bb3cbeedb8658076e6e5b9f463636ff05e424e8a9161ab6 +size 7992 diff --git a/sft_pretrain/Full_smoe/checkpoint-3328/zero_to_fp32.py b/sft_pretrain/Full_smoe/checkpoint-3328/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-3328/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Compute how a flat parameter is split evenly across ``world_size`` ranks.

    Args:
        unpartitioned_numel: total number of elements in the full parameter.
        world_size: number of ranks the parameter is partitioned over.

    Returns:
        A ``(partitioned_numel, padding_numel)`` tuple where
        ``partitioned_numel`` is the per-rank element count (the ceiling of
        ``unpartitioned_numel / world_size``) and ``padding_numel`` is the
        number of elements needed to round ``unpartitioned_numel`` up to a
        multiple of ``world_size``.

    Raises:
        ZeroDivisionError: if ``world_size`` is 0.
    """
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    # Exact integer ceil-division: the padded size is a multiple of world_size.
    # Going through float true-division (math.ceil(n / world_size)) silently
    # rounds for integers above 2**53 and can return a too-small partition.
    partitioned_numel = (unpartitioned_numel + padding_numel) // world_size
    return partitioned_numel, padding_numel
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the consolidated state dict for a ZeRO-3 checkpoint.

    The result is filled in this order: buffers from the rank-0 model state,
    frozen parameters (unless ``exclude_frozen_parameters`` is set), trainable
    parameters, and finally entries for shared parameters that point at an
    already-reconstructed tensor.

    Args:
        world_size: number of ranks, forwarded to the merge helpers.
        fp32_flat_groups: per-rank flat fp32 data, forwarded to the trainable-param merge.
        zero_model_states: per-rank model states; index 0 supplies buffers and shared params.
        exclude_frozen_parameters: when true, the frozen-parameter merge is skipped.

    Returns:
        An ``OrderedDict`` mapping names to reconstructed tensors.
    """
    merged = OrderedDict()

    # buffers come from the first rank's model state
    rank0_buffers = zero_model_states[0].buffers
    merged.update(rank0_buffers)
    if debug:
        print(f"added {len(rank0_buffers)} buffers")

    include_frozen = not exclude_frozen_parameters
    if include_frozen:
        _zero3_merge_frozen_params(merged, world_size, zero_model_states)

    _zero3_merge_trainable_params(merged, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: expose the source tensor under the alias name,
    # but only when the source was actually reconstructed
    for pair in zero_model_states[0].shared_params:
        source_name = pair[1]
        if source_name in merged:
            merged[pair[0]] = merged[source_name]

    return merged
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    The state dict is loaded with ``strict=False``; for a ``torch.nn.Module`` this means keys that
    are missing from, or unexpected in, the checkpoint do not raise.

    """
    # Plain string literals: these messages have no placeholders to interpolate.
    logger.info("Extracting fp32 weights")
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    # The consolidated state dict lives on cpu, so move the model there before loading.
    model = model.cpu()
    model.load_state_dict(state_dict, strict=False)

    return model
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/added_tokens.json b/sft_pretrain/Full_smoe/checkpoint-5198/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/config.json b/sft_pretrain/Full_smoe/checkpoint-5198/config.json new file mode 100644 index 0000000000000000000000000000000000000000..a5b687752b38ba994afe0584c1b87811b54cb708 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + 
"hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 
62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": 
"phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/generation_config.json b/sft_pretrain/Full_smoe/checkpoint-5198/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..543163d3490ac53e76540e9aa4f0bf73c9c96bf4 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0fc3f9d4afcbe79e106303b6e3e73d603bcc2e4977d6efca82a2377d95378de +size 396582032 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1ea2cac6915d5ed837a044ac392a2c38a63715a --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d35a373dcc15f55f3824f2126f2e7549a9a25eaa193c553d3f586a20b0b84fe +size 396582032 diff --git 
a/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..10dc3e587657b4cb220e4b8903718e3331e2a3a7 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be13e8db953c9cee03dcc064919e0fb3596d7637c828540fbf930ad80c8d551e +size 396582032 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a8852b51da273aeaf2dfd7a2aba9a877bd93a19 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3972b37ee3921bfa9452cc95979094f742c535660f46556b7cb183244692f507 +size 396582032 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..55ebdab8d7e3ece2800428e6f4e816fec862364c --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_0_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abd0b095d777c49dd54a474bebfb0bc13ae1c917b2d08cbb90097b7d2dbba695 +size 2117321480 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt new 
file mode 100644 index 0000000000000000000000000000000000000000..e4f890f8c67ee8be44e713fa86ef5db89574ecc8 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_1_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3d1957af01a2d851a1e7d585adc283475a8f5d0d39adea0c8fb2af93f0be304 +size 2117321480 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..b22f3d24122569bba3d13564588a8407d8e81624 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_2_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33b964e92bb39f2982f583e16ed0e28a6193acb00d143dfa06ab8dfac2c121f8 +size 2117321480 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8c11c399b08b8e8900333c5b92c6ec9bfb6548c --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/global_step5198/zero_pp_rank_3_mp_rank_00_model_states.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbee95eff387fcc228ce0d4f3aae2c846fe3897faf228bd6bff059bf03ffa976 +size 2117321480 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/latest b/sft_pretrain/Full_smoe/checkpoint-5198/latest new file mode 100644 index 0000000000000000000000000000000000000000..c0e63763d1d13a0ca7a3b62ff8f5cd1d69cc4978 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/latest @@ -0,0 +1 @@ +global_step5198 \ No newline at end of file diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/model-00001-of-00002.safetensors 
b/sft_pretrain/Full_smoe/checkpoint-5198/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe/checkpoint-5198/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e981ed7608d232dd8a7891b5ff88b3683fe200cc --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a47d1e574adf804d0e976f951ed4e76e2f15cfdc5dceca7c9c377f2462d65ca9 +size 3759025152 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/model.safetensors.index.json b/sft_pretrain/Full_smoe/checkpoint-5198/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3448fcaafe26e098595b9e2e5bd9e68d63ee24 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731424736 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + 
"model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + 
"model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": 
"model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + 
"model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_0.pth b/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef4849062bcdc8ffd2246c07673ba196a8d61a6d --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae2114fffe9b1eea30e28bbdb4ce59046b0079ea5b8dc4682079f609d49d787 +size 14960 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_1.pth b/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..2fcb2b640bc236c26aa841680d34a91240247970 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff5f3a53530ac868291e2667c8f824bfa1f4fa1ce880df8223a7165ef38e11 +size 14960 diff --git 
a/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_2.pth b/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00c3f989de00e6d58ca7345ae6f65fee0afcbdcd --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91f80a7779b0034e70106ba6cb0e3e686052334c20ce54453ee3977cc0219d15 +size 14960 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_3.pth b/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..f289913854ee3fa52a86e282421da07d85b8a4c4 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece3bc0d0e16c43ef245cc787cbd0d63d08d460f489c4cd52adf6501b9281a18 +size 14960 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/special_tokens_map.json b/sft_pretrain/Full_smoe/checkpoint-5198/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/tokenizer.model b/sft_pretrain/Full_smoe/checkpoint-5198/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/tokenizer.model @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/tokenizer_config.json b/sft_pretrain/Full_smoe/checkpoint-5198/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + 
"single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/trainer_state.json b/sft_pretrain/Full_smoe/checkpoint-5198/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ba01f38065ca6f6dd49dfe7b1f3405d041c3107a --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/trainer_state.json @@ -0,0 +1,78003 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + 
"is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03936368, + "balance_loss_mlp": 2.84994221, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 15.851083319408797, + "language_loss": 2.91765308, + "learning_rate": 0.0, + "loss": 1.97528625, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 22.685314178466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02018389, + "balance_loss_mlp": 1.26880157, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 26.607348306835714, + "language_loss": 2.4131012, + "learning_rate": 0.00013726078121135892, + "loss": 2.43328524, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 7.48828125, + "step": 2, + "time_per_iteration": 2.6085429191589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02035932, + "balance_loss_mlp": 1.28710687, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 23.480566200669728, + "language_loss": 2.12185097, + "learning_rate": 0.00021755319103969496, + "loss": 2.14221001, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 7.48046875, + "step": 3, + "time_per_iteration": 2.817356824874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02060169, + "balance_loss_mlp": 1.30028164, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 4.363008200765646, + "language_loss": 1.37660766, + "learning_rate": 0.00027452156242271784, + "loss": 1.39720929, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 7.59375, + "step": 4, + "time_per_iteration": 2.7677674293518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02070568, + "balance_loss_mlp": 1.31411338, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 0.9313793007477466, + "language_loss": 1.33924747, + "learning_rate": 
0.0003187096642208417, + "loss": 1.35995317, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 7.55859375, + "step": 5, + "time_per_iteration": 2.649566650390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02075998, + "balance_loss_mlp": 1.31763589, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 2.3251391322215498, + "language_loss": 1.31535721, + "learning_rate": 0.0003548139722510539, + "loss": 1.33611727, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 7.578125, + "step": 6, + "time_per_iteration": 2.715332269668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02105134, + "balance_loss_mlp": 1.3406682, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.8930621517096357, + "language_loss": 1.22756648, + "learning_rate": 0.00038533972973918044, + "loss": 1.24861789, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 7.640625, + "step": 7, + "time_per_iteration": 2.620546340942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02159823, + "balance_loss_mlp": 1.38276935, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.2913046553555926, + "language_loss": 1.17756534, + "learning_rate": 0.0004117823436340768, + "loss": 1.19916344, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 7.76171875, + "step": 8, + "time_per_iteration": 2.6581108570098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02180456, + "balance_loss_mlp": 1.39310265, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.2812491955312875, + "language_loss": 1.24828589, + "learning_rate": 0.00043510638207938993, + "loss": 1.27009046, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 7.859375, + "step": 9, + "time_per_iteration": 2.7921459674835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02220606, + "balance_loss_mlp": 1.43058181, + "epoch": 
0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.32786568158439683, + "language_loss": 1.14205348, + "learning_rate": 0.00045597044543220066, + "loss": 1.16425967, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 7.8984375, + "step": 10, + "time_per_iteration": 2.7258670330047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0223461, + "balance_loss_mlp": 1.43886435, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.1860296084342833, + "language_loss": 1.11914992, + "learning_rate": 0.00047484428652143135, + "loss": 1.14149594, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 7.953125, + "step": 11, + "time_per_iteration": 2.907498359680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02235376, + "balance_loss_mlp": 1.4423002, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.11947281146450546, + "language_loss": 1.17959428, + "learning_rate": 0.0004920747534624128, + "loss": 1.20194793, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 7.92578125, + "step": 12, + "time_per_iteration": 2.6528539657592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218147, + "balance_loss_mlp": 1.42507148, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.12512408660007263, + "language_loss": 1.20210767, + "learning_rate": 0.0005079252465375872, + "loss": 1.22428906, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 7.921875, + "step": 13, + "time_per_iteration": 2.8123886585235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02214103, + "balance_loss_mlp": 1.42140937, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.16684445783545154, + "language_loss": 1.10100055, + "learning_rate": 0.0005226005109505393, + "loss": 1.12314165, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 7.9140625, + "step": 14, + "time_per_iteration": 
2.628995180130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130591, + "balance_loss_mlp": 1.36459994, + "epoch": 0.0028857252789534437, + "flos": 434599644672.0, + "grad_norm": 0.1391159076902598, + "language_loss": 1.15644169, + "learning_rate": 0.0005362628552605367, + "loss": 1.17774749, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 7.65234375, + "step": 15, + "time_per_iteration": 2.650690793991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02123252, + "balance_loss_mlp": 1.36260176, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.12794674976623602, + "language_loss": 1.19969535, + "learning_rate": 0.0005490431248454357, + "loss": 1.22092795, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 7.6015625, + "step": 16, + "time_per_iteration": 2.7189841270446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0240823, + "balance_loss_mlp": 1.66054928, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2699272965631097, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78113341, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.46875, + "step": 17, + "time_per_iteration": 5.958680868148804 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02007176, + "balance_loss_mlp": 1.28352785, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.08195889268296155, + "language_loss": 1.0631001, + "learning_rate": 0.0005723671632907488, + "loss": 1.08317184, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.23046875, + "step": 18, + "time_per_iteration": 2.633267879486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01953804, + "balance_loss_mlp": 1.2572403, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11150538237586374, + "language_loss": 1.11837816, + "learning_rate": 0.0005830738490244919, + 
"loss": 1.13791621, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.97265625, + "step": 19, + "time_per_iteration": 2.526186466217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01920231, + "balance_loss_mlp": 1.24464774, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.09041890124841255, + "language_loss": 1.13942695, + "learning_rate": 0.0005932312266435596, + "loss": 1.15862942, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.76171875, + "step": 20, + "time_per_iteration": 2.8158531188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01861181, + "balance_loss_mlp": 1.21687818, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.1379829587383013, + "language_loss": 1.09075773, + "learning_rate": 0.0006028929207788754, + "loss": 1.10936952, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 6.44140625, + "step": 21, + "time_per_iteration": 2.7115283012390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01816904, + "balance_loss_mlp": 1.19815993, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.09955042249077097, + "language_loss": 1.11992621, + "learning_rate": 0.0006121050677327902, + "loss": 1.13809526, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 6.1796875, + "step": 22, + "time_per_iteration": 2.9170944690704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01769897, + "balance_loss_mlp": 1.18281531, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.08735903991507939, + "language_loss": 1.03007698, + "learning_rate": 0.0006209076479463684, + "loss": 1.04777598, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 5.8671875, + "step": 23, + "time_per_iteration": 2.6403517723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01733821, + "balance_loss_mlp": 1.17191648, + "epoch": 0.00461716044632551, + 
"flos": 548168518656.0, + "grad_norm": 0.08709154861799764, + "language_loss": 1.12691391, + "learning_rate": 0.0006293355346737718, + "loss": 1.14425218, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 5.62890625, + "step": 24, + "time_per_iteration": 2.706193208694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01681551, + "balance_loss_mlp": 1.14711165, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.08429969570703955, + "language_loss": 1.08894634, + "learning_rate": 0.0006374193284416834, + "loss": 1.10576177, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 5.34765625, + "step": 25, + "time_per_iteration": 2.788973808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01660379, + "balance_loss_mlp": 1.15416873, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.1402992304177309, + "language_loss": 1.07612705, + "learning_rate": 0.0006451860277489461, + "loss": 1.09273076, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 5.05859375, + "step": 26, + "time_per_iteration": 2.6577279567718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01646, + "balance_loss_mlp": 1.17107058, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.16239245775773925, + "language_loss": 1.14940214, + "learning_rate": 0.0006526595731190848, + "loss": 1.16586208, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 4.73828125, + "step": 27, + "time_per_iteration": 2.4788224697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01586113, + "balance_loss_mlp": 1.1497122, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.0939755899328463, + "language_loss": 1.08969474, + "learning_rate": 0.0006598612921618983, + "loss": 1.10555601, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 4.375, + "step": 28, + "time_per_iteration": 2.8451075553894043 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01530584, + "balance_loss_mlp": 1.12393713, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + "grad_norm": 0.08153278055262643, + "language_loss": 1.02661419, + "learning_rate": 0.0006668102665011454, + "loss": 1.04191995, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 4.0703125, + "step": 29, + "time_per_iteration": 3.3112235069274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0149795, + "balance_loss_mlp": 1.11743355, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.14907920412525114, + "language_loss": 1.11315072, + "learning_rate": 0.0006735236364718957, + "loss": 1.1281302, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 3.80273438, + "step": 30, + "time_per_iteration": 2.744025945663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444605, + "balance_loss_mlp": 1.09651423, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.1454661106932218, + "language_loss": 1.10029531, + "learning_rate": 0.0006800168558381346, + "loss": 1.11474133, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 3.484375, + "step": 31, + "time_per_iteration": 2.6526310443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408917, + "balance_loss_mlp": 1.08962691, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.13886710462644744, + "language_loss": 1.12821865, + "learning_rate": 0.0006863039060567947, + "loss": 1.14230776, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 3.19140625, + "step": 32, + "time_per_iteration": 2.778316020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01386345, + "balance_loss_mlp": 1.0916599, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.0950000822083296, + "language_loss": 1.06182003, + "learning_rate": 0.0006923974775611263, + "loss": 1.07568347, + 
"num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 2.94726562, + "step": 33, + "time_per_iteration": 2.822932243347168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01377092, + "balance_loss_mlp": 1.10586727, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.0933492164101247, + "language_loss": 1.02986193, + "learning_rate": 0.0006983091239737814, + "loss": 1.04363275, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 2.71484375, + "step": 34, + "time_per_iteration": 3.030482530593872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01362684, + "balance_loss_mlp": 1.11224914, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.11255876729792032, + "language_loss": 1.0177412, + "learning_rate": 0.0007040493939600222, + "loss": 1.03136802, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 2.50195312, + "step": 35, + "time_per_iteration": 2.849836826324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339928, + "balance_loss_mlp": 1.10723162, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.05318315286224845, + "language_loss": 1.02413034, + "learning_rate": 0.0007096279445021078, + "loss": 1.03752947, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 2.32421875, + "step": 36, + "time_per_iteration": 2.7724404335021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333933, + "balance_loss_mlp": 1.12202668, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.09673231095327042, + "language_loss": 1.09330344, + "learning_rate": 0.0007150536386503726, + "loss": 1.10664272, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 2.12304688, + "step": 37, + "time_per_iteration": 2.87898588180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131197, + "balance_loss_mlp": 1.11932778, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + 
"grad_norm": 0.1501774474729275, + "language_loss": 1.02011764, + "learning_rate": 0.0007203346302358509, + "loss": 1.03323734, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 1.92578125, + "step": 38, + "time_per_iteration": 2.9664244651794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301697, + "balance_loss_mlp": 1.11916423, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.13354549864664766, + "language_loss": 1.06722176, + "learning_rate": 0.000725478437577282, + "loss": 1.08023882, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 1.82324219, + "step": 39, + "time_per_iteration": 2.8403327465057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269709, + "balance_loss_mlp": 1.10262501, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.06892008670865749, + "language_loss": 1.01746094, + "learning_rate": 0.0007304920078549186, + "loss": 1.03015804, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 1.671875, + "step": 40, + "time_per_iteration": 2.7219579219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271556, + "balance_loss_mlp": 1.1131506, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.1603294487215327, + "language_loss": 1.03720689, + "learning_rate": 0.0007353817735343603, + "loss": 1.04992247, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 1.58300781, + "step": 41, + "time_per_iteration": 2.7060108184814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246674, + "balance_loss_mlp": 1.10390913, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.0511848053751201, + "language_loss": 0.99442279, + "learning_rate": 0.0007401537019902344, + "loss": 1.00688958, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 1.42871094, + "step": 42, + "time_per_iteration": 2.633784294128418 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01227359, + "balance_loss_mlp": 1.0990901, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.10374794700504324, + "language_loss": 1.02897811, + "learning_rate": 0.0007448133392900729, + "loss": 1.04125178, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.28222656, + "step": 43, + "time_per_iteration": 2.7279117107391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123262, + "balance_loss_mlp": 1.11207604, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.09096864884609944, + "language_loss": 0.98755985, + "learning_rate": 0.0007493658489441491, + "loss": 0.99988604, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.20410156, + "step": 44, + "time_per_iteration": 2.8941659927368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217323, + "balance_loss_mlp": 1.10812736, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.11598802445363406, + "language_loss": 1.0210619, + "learning_rate": 0.0007538160463002316, + "loss": 1.03323507, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.09375, + "step": 45, + "time_per_iteration": 2.7019526958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216008, + "balance_loss_mlp": 1.11510944, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.06911757836433406, + "language_loss": 1.05356646, + "learning_rate": 0.0007581684291577274, + "loss": 1.06572652, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 1.00927734, + "step": 46, + "time_per_iteration": 2.5990471839904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209164, + "balance_loss_mlp": 1.11603808, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.09057982339816145, + "language_loss": 1.08819616, + "learning_rate": 0.0007624272050891776, + "loss": 1.10028791, + "num_input_tokens_seen": 3555616, + 
"router_z_loss_mlp": 0.93066406, + "step": 47, + "time_per_iteration": 2.8298892974853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175031, + "balance_loss_mlp": 1.09315765, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.06662076278867826, + "language_loss": 0.98563552, + "learning_rate": 0.0007665963158851307, + "loss": 0.99738586, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.81884766, + "step": 48, + "time_per_iteration": 2.840701103210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175621, + "balance_loss_mlp": 1.10109115, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.07605871591802618, + "language_loss": 1.06984305, + "learning_rate": 0.0007706794594783609, + "loss": 1.08159924, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.74511719, + "step": 49, + "time_per_iteration": 2.7622482776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171173, + "balance_loss_mlp": 1.10093522, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.048657988043197084, + "language_loss": 1.05961394, + "learning_rate": 0.0007746801096530423, + "loss": 1.07132566, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.703125, + "step": 50, + "time_per_iteration": 2.768888473510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173086, + "balance_loss_mlp": 1.10890365, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.10082721582881933, + "language_loss": 1.10655856, + "learning_rate": 0.0007786015338021173, + "loss": 1.11828947, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.64160156, + "step": 51, + "time_per_iteration": 2.6473164558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155225, + "balance_loss_mlp": 1.09590614, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.0966315307988203, + 
"language_loss": 1.03207719, + "learning_rate": 0.0007824468089603051, + "loss": 1.04362941, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.59277344, + "step": 52, + "time_per_iteration": 2.6773018836975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011599, + "balance_loss_mlp": 1.10766244, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.060495503821074374, + "language_loss": 1.02858949, + "learning_rate": 0.0007862188363098669, + "loss": 1.04018843, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.52319336, + "step": 53, + "time_per_iteration": 3.174023389816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150276, + "balance_loss_mlp": 1.10125709, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.08315393852704078, + "language_loss": 1.03287244, + "learning_rate": 0.0007899203543304438, + "loss": 1.04437518, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.48974609, + "step": 54, + "time_per_iteration": 2.7804617881774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158937, + "balance_loss_mlp": 1.11192107, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.13140028768126893, + "language_loss": 1.16694331, + "learning_rate": 0.0007935539507422731, + "loss": 1.1785326, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.47021484, + "step": 55, + "time_per_iteration": 2.6466386318206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137235, + "balance_loss_mlp": 1.09496331, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.06179334078386534, + "language_loss": 1.08511901, + "learning_rate": 0.0007971220733732573, + "loss": 1.09649134, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.42285156, + "step": 56, + "time_per_iteration": 2.7039074897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138761, 
+ "balance_loss_mlp": 1.10166252, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.08220293288244152, + "language_loss": 1.03500617, + "learning_rate": 0.0008006270400641869, + "loss": 1.04639375, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.37084961, + "step": 57, + "time_per_iteration": 2.7175657749176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113224, + "balance_loss_mlp": 1.0981698, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.07093854356308794, + "language_loss": 1.04580712, + "learning_rate": 0.0008040710477125043, + "loss": 1.0571295, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.34106445, + "step": 58, + "time_per_iteration": 2.7424120903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135189, + "balance_loss_mlp": 1.10312176, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.07916475402820797, + "language_loss": 1.05395138, + "learning_rate": 0.0008074561805429771, + "loss": 1.06530333, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.32055664, + "step": 59, + "time_per_iteration": 2.7407617568969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130551, + "balance_loss_mlp": 1.10155916, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.04727062297709066, + "language_loss": 1.03273892, + "learning_rate": 0.0008107844176832545, + "loss": 1.04404449, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.28979492, + "step": 60, + "time_per_iteration": 2.6854803562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141191, + "balance_loss_mlp": 1.11353481, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.0952762711916136, + "language_loss": 1.04648042, + "learning_rate": 0.0008140576401132568, + "loss": 1.05789232, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.27685547, 
+ "step": 61, + "time_per_iteration": 2.6589457988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137564, + "balance_loss_mlp": 1.11303091, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.07958403959823916, + "language_loss": 1.06014252, + "learning_rate": 0.0008172776370494935, + "loss": 1.07151818, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.24536133, + "step": 62, + "time_per_iteration": 2.768505334854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112985, + "balance_loss_mlp": 1.10548401, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.09183855716602674, + "language_loss": 1.12897038, + "learning_rate": 0.0008204461118185703, + "loss": 1.14026892, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.24353027, + "step": 63, + "time_per_iteration": 2.5573627948760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130628, + "balance_loss_mlp": 1.10793018, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.09747713298414284, + "language_loss": 1.02471447, + "learning_rate": 0.0008235646872681536, + "loss": 1.03602076, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.22692871, + "step": 64, + "time_per_iteration": 2.585127353668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127769, + "balance_loss_mlp": 1.10554826, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.10571288349070412, + "language_loss": 1.02039421, + "learning_rate": 0.0008266349107584288, + "loss": 1.03167176, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.2220459, + "step": 65, + "time_per_iteration": 2.703620433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140513, + "balance_loss_mlp": 1.11872149, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.14637461076762864, + "language_loss": 1.05036354, + 
"learning_rate": 0.0008296582587724851, + "loss": 1.06176865, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.21801758, + "step": 66, + "time_per_iteration": 2.728839159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121296, + "balance_loss_mlp": 1.09962404, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.10157918798152736, + "language_loss": 1.03485751, + "learning_rate": 0.0008326361411800136, + "loss": 1.04607058, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.21704102, + "step": 67, + "time_per_iteration": 2.963634729385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119673, + "balance_loss_mlp": 1.09863222, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.046087363126128704, + "language_loss": 1.03369427, + "learning_rate": 0.0008355699051851403, + "loss": 1.044891, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.21057129, + "step": 68, + "time_per_iteration": 2.7779767513275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146637, + "balance_loss_mlp": 1.12541735, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.10078437623262682, + "language_loss": 1.10584092, + "learning_rate": 0.0008384608389860635, + "loss": 1.11730719, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.21228027, + "step": 69, + "time_per_iteration": 2.72163724899292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158467, + "balance_loss_mlp": 1.13795137, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.07269171982270876, + "language_loss": 1.00728607, + "learning_rate": 0.000841310175171381, + "loss": 1.01887083, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.2052002, + "step": 70, + "time_per_iteration": 2.653019666671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157048, + "balance_loss_mlp": 1.13693786, + 
"epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.09340958478170322, + "language_loss": 0.98922431, + "learning_rate": 0.000844119093875517, + "loss": 1.00079489, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.2010498, + "step": 71, + "time_per_iteration": 2.722351551055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152491, + "balance_loss_mlp": 1.13224936, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.08018714642813927, + "language_loss": 1.04454517, + "learning_rate": 0.0008468887257134666, + "loss": 1.05607009, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.20239258, + "step": 72, + "time_per_iteration": 2.7619922161102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134825, + "balance_loss_mlp": 1.11441696, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.07872027680195416, + "language_loss": 1.06334233, + "learning_rate": 0.0008496201545131264, + "loss": 1.07469058, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.20410156, + "step": 73, + "time_per_iteration": 2.7532896995544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135406, + "balance_loss_mlp": 1.11529493, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.07696506497764126, + "language_loss": 1.03964853, + "learning_rate": 0.0008523144198617317, + "loss": 1.0510025, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.20092773, + "step": 74, + "time_per_iteration": 3.220428943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113223, + "balance_loss_mlp": 1.11140466, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.08624589903961616, + "language_loss": 1.03597379, + "learning_rate": 0.0008549725194813783, + "loss": 1.04729605, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.20825195, + "step": 75, + "time_per_iteration": 
2.6929681301116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126542, + "balance_loss_mlp": 1.1071701, + "epoch": 0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.1408086440247197, + "language_loss": 1.02827942, + "learning_rate": 0.0008575954114472099, + "loss": 1.03954494, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.19360352, + "step": 76, + "time_per_iteration": 3.1799752712249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139543, + "balance_loss_mlp": 1.12005258, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.07592994584910524, + "language_loss": 1.00451732, + "learning_rate": 0.0008601840162606118, + "loss": 1.01591277, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.19470215, + "step": 77, + "time_per_iteration": 3.0833282470703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138555, + "balance_loss_mlp": 1.11827779, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.08431129228371863, + "language_loss": 1.0643971, + "learning_rate": 0.000862739218788641, + "loss": 1.07578266, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.20275879, + "step": 78, + "time_per_iteration": 2.8568053245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141293, + "balance_loss_mlp": 1.121135, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.11686764405791189, + "language_loss": 1.04346561, + "learning_rate": 0.0008652618700799138, + "loss": 1.05487859, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.20153809, + "step": 79, + "time_per_iteration": 2.6828417778015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144733, + "balance_loss_mlp": 1.12453914, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.10817734170715895, + "language_loss": 1.03413367, + "learning_rate": 0.0008677527890662774, + 
"loss": 1.0455811, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.2019043, + "step": 80, + "time_per_iteration": 2.4982268810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142756, + "balance_loss_mlp": 1.12232339, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.09792899658664883, + "language_loss": 1.04667735, + "learning_rate": 0.0008702127641587799, + "loss": 1.05810475, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.20422363, + "step": 81, + "time_per_iteration": 2.7113406658172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136455, + "balance_loss_mlp": 1.11561751, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.10099710945407976, + "language_loss": 1.00204504, + "learning_rate": 0.0008726425547457192, + "loss": 1.01340961, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.20825195, + "step": 82, + "time_per_iteration": 2.8304948806762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140166, + "balance_loss_mlp": 1.12054384, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.11260184265628481, + "language_loss": 0.99513066, + "learning_rate": 0.0008750428925998964, + "loss": 1.00653231, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.19604492, + "step": 83, + "time_per_iteration": 2.762498617172241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147828, + "balance_loss_mlp": 1.12830114, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.1180796768296156, + "language_loss": 1.05058432, + "learning_rate": 0.0008774144832015932, + "loss": 1.06206274, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.19519043, + "step": 84, + "time_per_iteration": 2.749310255050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01971265, + "balance_loss_mlp": 1.95724583, + "epoch": 0.016352443247402846, + 
"flos": 1411343543808.0, + "grad_norm": 0.4228509486674634, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.7674557, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.140625, + "step": 85, + "time_per_iteration": 4.626708745956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137886, + "balance_loss_mlp": 1.11834753, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.09445954258913132, + "language_loss": 1.0054847, + "learning_rate": 0.0008820741205014318, + "loss": 1.01686358, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.1953125, + "step": 86, + "time_per_iteration": 2.918696403503418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145479, + "balance_loss_mlp": 1.12540436, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.14940142735495454, + "language_loss": 1.02554607, + "learning_rate": 0.0008843634575408404, + "loss": 1.03700089, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20068359, + "step": 87, + "time_per_iteration": 2.6972436904907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140944, + "balance_loss_mlp": 1.12226439, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.07729580722494055, + "language_loss": 1.03912258, + "learning_rate": 0.0008866266301555082, + "loss": 1.0505321, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.18676758, + "step": 88, + "time_per_iteration": 2.741374969482422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164989, + "balance_loss_mlp": 1.14647579, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.12135336715529384, + "language_loss": 1.04746294, + "learning_rate": 0.0008888642296509615, + "loss": 1.05911291, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.18493652, + "step": 89, + "time_per_iteration": 2.62099552154541 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183021, + "balance_loss_mlp": 1.16370893, + "epoch": 0.01731435167372066, + "flos": 625596618240.0, + "grad_norm": 0.13101998707614188, + "language_loss": 1.08785903, + "learning_rate": 0.0008910768275115906, + "loss": 1.09968925, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.19311523, + "step": 90, + "time_per_iteration": 2.819420099258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181615, + "balance_loss_mlp": 1.16215992, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.1050230223941115, + "language_loss": 1.04935551, + "learning_rate": 0.0008932649762767675, + "loss": 1.06117165, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.19445801, + "step": 91, + "time_per_iteration": 2.622406244277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182279, + "balance_loss_mlp": 1.16277599, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.08683374673810437, + "language_loss": 1.07276869, + "learning_rate": 0.0008954292103690864, + "loss": 1.08459151, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.19494629, + "step": 92, + "time_per_iteration": 2.9198801517486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185402, + "balance_loss_mlp": 1.16578054, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.18507766534910622, + "language_loss": 1.0957979, + "learning_rate": 0.0008975700468778296, + "loss": 1.10765195, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.19616699, + "step": 93, + "time_per_iteration": 2.6395699977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183129, + "balance_loss_mlp": 1.1639359, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.14308809926092464, + "language_loss": 1.0301311, + "learning_rate": 0.0008996879863005366, + "loss": 1.04196239, + 
"num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.19189453, + "step": 94, + "time_per_iteration": 2.685325860977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192741, + "balance_loss_mlp": 1.17335784, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.08942422865693514, + "language_loss": 1.02994668, + "learning_rate": 0.0009017835132453337, + "loss": 1.04187417, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.19360352, + "step": 95, + "time_per_iteration": 2.640179395675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185298, + "balance_loss_mlp": 1.1659379, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.12775293798220247, + "language_loss": 1.03491902, + "learning_rate": 0.0009038570970964896, + "loss": 1.046772, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.19348145, + "step": 96, + "time_per_iteration": 2.8062894344329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153754, + "balance_loss_mlp": 1.13440657, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.07493462569205835, + "language_loss": 1.00418913, + "learning_rate": 0.0009059091926454854, + "loss": 1.01572669, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.1932373, + "step": 97, + "time_per_iteration": 2.625839948654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147507, + "balance_loss_mlp": 1.12845731, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.09820444328466757, + "language_loss": 0.99835473, + "learning_rate": 0.0009079402406897198, + "loss": 1.00982976, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.19042969, + "step": 98, + "time_per_iteration": 3.2515511512756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153758, + "balance_loss_mlp": 1.13449359, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + 
"grad_norm": 0.1057784840893083, + "language_loss": 1.01116824, + "learning_rate": 0.0009099506686008212, + "loss": 1.02270579, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.19262695, + "step": 99, + "time_per_iteration": 2.8564164638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131688, + "balance_loss_mlp": 1.11337709, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.06422494393503501, + "language_loss": 1.04474521, + "learning_rate": 0.0009119408908644013, + "loss": 1.0560621, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.18310547, + "step": 100, + "time_per_iteration": 2.717921495437622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126129, + "balance_loss_mlp": 1.10765147, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.13157053780302536, + "language_loss": 1.09764636, + "learning_rate": 0.0009139113095929519, + "loss": 1.1089077, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.18469238, + "step": 101, + "time_per_iteration": 2.8778345584869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147568, + "balance_loss_mlp": 1.12801814, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.09138386946816152, + "language_loss": 1.03731561, + "learning_rate": 0.0009158623150134762, + "loss": 1.04879129, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.19543457, + "step": 102, + "time_per_iteration": 2.588974952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127418, + "balance_loss_mlp": 1.10807002, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.09239208832300977, + "language_loss": 1.03516126, + "learning_rate": 0.000917794285931332, + "loss": 1.04643536, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.1932373, + "step": 103, + "time_per_iteration": 2.680100917816162 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01126804, + "balance_loss_mlp": 1.10709858, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + "grad_norm": 0.06521053042835766, + "language_loss": 0.95701432, + "learning_rate": 0.0009197075901716639, + "loss": 0.96828234, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.19689941, + "step": 104, + "time_per_iteration": 2.730409860610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154929, + "balance_loss_mlp": 1.13441312, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.1079045695165621, + "language_loss": 1.06002212, + "learning_rate": 0.0009216025849997171, + "loss": 1.07157135, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.2052002, + "step": 105, + "time_per_iteration": 2.8010010719299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125545, + "balance_loss_mlp": 1.10562515, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.06774775888709755, + "language_loss": 1.00999045, + "learning_rate": 0.0009234796175212258, + "loss": 1.02124596, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.19909668, + "step": 106, + "time_per_iteration": 3.0094785690307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134088, + "balance_loss_mlp": 1.11433506, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.09956588263492473, + "language_loss": 1.04219186, + "learning_rate": 0.000925339025064007, + "loss": 1.05353272, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.1973877, + "step": 107, + "time_per_iteration": 2.9836714267730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112225, + "balance_loss_mlp": 1.1024735, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.06168154311284234, + "language_loss": 0.97232246, + "learning_rate": 0.0009271811355418027, + "loss": 0.98354501, + "num_input_tokens_seen": 8148144, 
+ "router_z_loss_mlp": 0.19775391, + "step": 108, + "time_per_iteration": 2.860042095184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120623, + "balance_loss_mlp": 1.10089409, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.10451884090896614, + "language_loss": 1.03835416, + "learning_rate": 0.0009290062678013548, + "loss": 1.04956043, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.19714355, + "step": 109, + "time_per_iteration": 2.8912689685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116129, + "balance_loss_mlp": 1.09641171, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.12087494450090952, + "language_loss": 1.02292705, + "learning_rate": 0.0009308147319536321, + "loss": 1.03408837, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.19702148, + "step": 110, + "time_per_iteration": 2.682143449783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123774, + "balance_loss_mlp": 1.10437846, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.09468465669548881, + "language_loss": 1.08714509, + "learning_rate": 0.0009326068296900676, + "loss": 1.09838271, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.19372559, + "step": 111, + "time_per_iteration": 2.8420276641845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113573, + "balance_loss_mlp": 1.09368885, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.06573635575260657, + "language_loss": 1.00160766, + "learning_rate": 0.0009343828545846161, + "loss": 1.01274335, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.19873047, + "step": 112, + "time_per_iteration": 2.81919264793396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140317, + "balance_loss_mlp": 1.1205641, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 
0.10387186502959084, + "language_loss": 1.03632593, + "learning_rate": 0.0009361430923823841, + "loss": 1.04772925, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.1973877, + "step": 113, + "time_per_iteration": 2.6119744777679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125249, + "balance_loss_mlp": 1.1051383, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.07902170601711563, + "language_loss": 1.07192981, + "learning_rate": 0.0009378878212755459, + "loss": 1.08318233, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.2010498, + "step": 114, + "time_per_iteration": 2.511798143386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121266, + "balance_loss_mlp": 1.10053515, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.07803352047235128, + "language_loss": 0.97866738, + "learning_rate": 0.0009396173121672103, + "loss": 0.98988008, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.20739746, + "step": 115, + "time_per_iteration": 2.664508819580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129328, + "balance_loss_mlp": 1.10866928, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.0856857268925464, + "language_loss": 1.03136635, + "learning_rate": 0.0009413318289238633, + "loss": 1.04265964, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.20666504, + "step": 116, + "time_per_iteration": 2.78078031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107565, + "balance_loss_mlp": 1.08696532, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.07931864844683259, + "language_loss": 0.9541564, + "learning_rate": 0.0009430316286169771, + "loss": 0.96523207, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.20605469, + "step": 117, + "time_per_iteration": 3.034813404083252 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01162526, + "balance_loss_mlp": 1.14062762, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 0.10907247817659571, + "language_loss": 1.00617993, + "learning_rate": 0.0009447169617543361, + "loss": 1.0178051, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.21899414, + "step": 118, + "time_per_iteration": 2.6340808868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173058, + "balance_loss_mlp": 1.15192246, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.12286397781369558, + "language_loss": 1.06791735, + "learning_rate": 0.0009463880725016029, + "loss": 1.0796479, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.21142578, + "step": 119, + "time_per_iteration": 2.7167999744415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112992, + "balance_loss_mlp": 1.10922527, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.1818455397825579, + "language_loss": 1.0306797, + "learning_rate": 0.0009480451988946134, + "loss": 1.04197884, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.20703125, + "step": 120, + "time_per_iteration": 2.8320834636688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127416, + "balance_loss_mlp": 1.10706663, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.061341545621049966, + "language_loss": 1.03699958, + "learning_rate": 0.0009496885730428627, + "loss": 1.04827368, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.20349121, + "step": 121, + "time_per_iteration": 3.0393545627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141944, + "balance_loss_mlp": 1.12239408, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.12547580017264032, + "language_loss": 1.01912796, + "learning_rate": 0.0009513184213246156, + "loss": 1.0305475, + "num_input_tokens_seen": 9160608, 
+ "router_z_loss_mlp": 0.19543457, + "step": 122, + "time_per_iteration": 2.651719093322754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162278, + "balance_loss_mlp": 1.14191747, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.1065762842559702, + "language_loss": 1.05289114, + "learning_rate": 0.0009529349645740552, + "loss": 1.06451392, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.20361328, + "step": 123, + "time_per_iteration": 2.705214262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165232, + "balance_loss_mlp": 1.14444137, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.12380591024493681, + "language_loss": 1.04425788, + "learning_rate": 0.0009545384182608524, + "loss": 1.05591035, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.2076416, + "step": 124, + "time_per_iteration": 2.544631004333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143364, + "balance_loss_mlp": 1.12262154, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.07613482272455964, + "language_loss": 1.01444972, + "learning_rate": 0.0009561289926625252, + "loss": 1.0258832, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.20739746, + "step": 125, + "time_per_iteration": 2.6732449531555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140968, + "balance_loss_mlp": 1.11927211, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.13062804118014867, + "language_loss": 1.05952811, + "learning_rate": 0.0009577068930299292, + "loss": 1.07093775, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.21691895, + "step": 126, + "time_per_iteration": 2.5860514640808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111442, + "balance_loss_mlp": 1.09249783, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.11550485665133546, + 
"language_loss": 1.01208651, + "learning_rate": 0.0009592723197462087, + "loss": 1.02323079, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.21923828, + "step": 127, + "time_per_iteration": 2.680792808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139077, + "balance_loss_mlp": 1.1162957, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.07531268866570652, + "language_loss": 0.98376709, + "learning_rate": 0.0009608254684795125, + "loss": 0.99515784, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.2277832, + "step": 128, + "time_per_iteration": 2.962553024291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151675, + "balance_loss_mlp": 1.12746358, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.10067493874109901, + "language_loss": 1.01099372, + "learning_rate": 0.0009623665303297678, + "loss": 1.02251053, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.24206543, + "step": 129, + "time_per_iteration": 2.7238845825195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178581, + "balance_loss_mlp": 1.1552279, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.11648498824832396, + "language_loss": 1.04954159, + "learning_rate": 0.0009638956919697878, + "loss": 1.06132734, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.23352051, + "step": 130, + "time_per_iteration": 2.878931999206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180963, + "balance_loss_mlp": 1.15737128, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.07835178368021106, + "language_loss": 0.97041726, + "learning_rate": 0.0009654131357809714, + "loss": 0.98222685, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.23596191, + "step": 131, + "time_per_iteration": 2.646268367767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01187479, + "balance_loss_mlp": 1.1633389, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.08592709100669786, + "language_loss": 1.06445599, + "learning_rate": 0.0009669190399838441, + "loss": 1.07633078, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.24169922, + "step": 132, + "time_per_iteration": 3.1253442764282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178278, + "balance_loss_mlp": 1.15288627, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.06433616224475917, + "language_loss": 0.99044776, + "learning_rate": 0.0009684135787636724, + "loss": 1.00223053, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.25402832, + "step": 133, + "time_per_iteration": 2.831838846206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193803, + "balance_loss_mlp": 1.16735041, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.10671106752503096, + "language_loss": 1.03402495, + "learning_rate": 0.0009698969223913726, + "loss": 1.04596305, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.26452637, + "step": 134, + "time_per_iteration": 3.0395402908325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167127, + "balance_loss_mlp": 1.14202118, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.08439936893253437, + "language_loss": 1.06654739, + "learning_rate": 0.0009713692373399265, + "loss": 1.0782187, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.25109863, + "step": 135, + "time_per_iteration": 2.7715206146240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463238, + "balance_loss_mlp": 1.43119502, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.13141202298162255, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.80919468, + "num_input_tokens_seen": 10348976, + 
"router_z_loss_mlp": 0.3203125, + "step": 136, + "time_per_iteration": 5.708434820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01366397, + "balance_loss_mlp": 1.3391223, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.10789098637796743, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79177433, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.2734375, + "step": 137, + "time_per_iteration": 4.936312198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164239, + "balance_loss_mlp": 1.14192283, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.07737815407023008, + "language_loss": 0.99685001, + "learning_rate": 0.0009757216201974225, + "loss": 1.00849247, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.22338867, + "step": 138, + "time_per_iteration": 2.848794460296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186286, + "balance_loss_mlp": 1.16373122, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.07356199280990307, + "language_loss": 1.04477906, + "learning_rate": 0.0009771514130396581, + "loss": 1.05664206, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.22546387, + "step": 139, + "time_per_iteration": 2.735100746154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191574, + "balance_loss_mlp": 1.17103469, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.09793912671864533, + "language_loss": 1.04422235, + "learning_rate": 0.00097857095638274, + "loss": 1.05613816, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.20544434, + "step": 140, + "time_per_iteration": 2.6398932933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187088, + "balance_loss_mlp": 1.16559434, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 
0.08846308668893199, + "language_loss": 0.95874435, + "learning_rate": 0.0009799803961288726, + "loss": 0.97061527, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.21484375, + "step": 141, + "time_per_iteration": 3.0505003929138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160228, + "balance_loss_mlp": 1.13921118, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.09598553540990232, + "language_loss": 1.0168581, + "learning_rate": 0.000981379875086876, + "loss": 1.02846038, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.21020508, + "step": 142, + "time_per_iteration": 3.0870697498321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143434, + "balance_loss_mlp": 1.12091553, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.08800286540083159, + "language_loss": 0.96917391, + "learning_rate": 0.0009827695330590185, + "loss": 0.98060828, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.2253418, + "step": 143, + "time_per_iteration": 2.719317674636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128989, + "balance_loss_mlp": 1.10631514, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.09792527853337853, + "language_loss": 0.96426451, + "learning_rate": 0.0009841495069248256, + "loss": 0.97555441, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.22692871, + "step": 144, + "time_per_iteration": 3.014765739440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011276, + "balance_loss_mlp": 1.10584438, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.06966533855263184, + "language_loss": 0.95713264, + "learning_rate": 0.0009855199307219871, + "loss": 0.9684087, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.21777344, + "step": 145, + "time_per_iteration": 2.6709253787994385 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01148016, + "balance_loss_mlp": 1.12558043, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 0.09436929899226476, + "language_loss": 0.97337723, + "learning_rate": 0.0009868809357244854, + "loss": 0.98485744, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.22424316, + "step": 146, + "time_per_iteration": 2.669283390045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119223, + "balance_loss_mlp": 1.09726429, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.0790038702862921, + "language_loss": 1.01669443, + "learning_rate": 0.0009882326505180556, + "loss": 1.02788651, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.21948242, + "step": 147, + "time_per_iteration": 2.704292058944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138866, + "balance_loss_mlp": 1.11706281, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.10005424592603226, + "language_loss": 0.99863935, + "learning_rate": 0.0009895752010730906, + "loss": 1.010028, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.21801758, + "step": 148, + "time_per_iteration": 2.9581809043884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113837, + "balance_loss_mlp": 1.09122324, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.059614583623975884, + "language_loss": 1.06015503, + "learning_rate": 0.0009909087108150867, + "loss": 1.07129347, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.22619629, + "step": 149, + "time_per_iteration": 2.741159200668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121459, + "balance_loss_mlp": 1.09761691, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.11202611832617501, + "language_loss": 1.06769323, + "learning_rate": 0.0009922333006927371, + "loss": 1.07890773, + "num_input_tokens_seen": 
11544240, + "router_z_loss_mlp": 0.23852539, + "step": 150, + "time_per_iteration": 2.4982028007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130563, + "balance_loss_mlp": 1.10591054, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.07605307561327067, + "language_loss": 1.00263429, + "learning_rate": 0.0009935490892437632, + "loss": 1.01393986, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.2467041, + "step": 151, + "time_per_iteration": 2.603449583053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144852, + "balance_loss_mlp": 1.12064028, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.10272840367827417, + "language_loss": 0.98558784, + "learning_rate": 0.0009948561926585687, + "loss": 0.99703634, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.24206543, + "step": 152, + "time_per_iteration": 2.8270881175994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152248, + "balance_loss_mlp": 1.12610579, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09161667027770293, + "language_loss": 1.02430511, + "learning_rate": 0.0009961547248418122, + "loss": 1.03582752, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.26159668, + "step": 153, + "time_per_iteration": 2.6539955139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145984, + "balance_loss_mlp": 1.11949599, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.12801957864517624, + "language_loss": 0.99122071, + "learning_rate": 0.0009974447974719707, + "loss": 1.00268054, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.26477051, + "step": 154, + "time_per_iteration": 2.7382068634033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149383, + "balance_loss_mlp": 1.12209582, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 
0.08800353648973465, + "language_loss": 1.01358569, + "learning_rate": 0.0009987265200589763, + "loss": 1.02507949, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.27307129, + "step": 155, + "time_per_iteration": 2.7484042644500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146509, + "balance_loss_mlp": 1.11942446, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.06940376161599653, + "language_loss": 1.00859666, + "learning_rate": 0.001, + "loss": 1.02006161, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.27124023, + "step": 156, + "time_per_iteration": 2.9298081398010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143379, + "balance_loss_mlp": 1.11696208, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.07290625745558146, + "language_loss": 0.98239183, + "learning_rate": 0.0009999999029413921, + "loss": 0.99382555, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.2644043, + "step": 157, + "time_per_iteration": 2.8521509170532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143494, + "balance_loss_mlp": 1.11851931, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.10227720632383495, + "language_loss": 0.99759698, + "learning_rate": 0.0009999996117656068, + "loss": 1.00903201, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.24975586, + "step": 158, + "time_per_iteration": 2.7299323081970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132124, + "balance_loss_mlp": 1.10562384, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.10099325970079884, + "language_loss": 0.93055141, + "learning_rate": 0.0009999991264727564, + "loss": 0.94187272, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.26489258, + "step": 159, + "time_per_iteration": 2.838892698287964 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01115378, + "balance_loss_mlp": 1.08908033, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 0.07019569540009855, + "language_loss": 1.04060161, + "learning_rate": 0.0009999984470630296, + "loss": 1.05175543, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.26330566, + "step": 160, + "time_per_iteration": 2.6468522548675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127141, + "balance_loss_mlp": 1.09948444, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.11009923170951091, + "language_loss": 0.93022841, + "learning_rate": 0.0009999975735366902, + "loss": 0.94149983, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.27636719, + "step": 161, + "time_per_iteration": 3.0944836139678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113017, + "balance_loss_mlp": 1.10256159, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.1021195191465028, + "language_loss": 0.94580781, + "learning_rate": 0.0009999965058940775, + "loss": 0.95710957, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.27624512, + "step": 162, + "time_per_iteration": 3.5266401767730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140975, + "balance_loss_mlp": 1.11293721, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.10168672994339449, + "language_loss": 1.00657988, + "learning_rate": 0.0009999952441356057, + "loss": 1.01798964, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.28027344, + "step": 163, + "time_per_iteration": 2.5260584354400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117443, + "balance_loss_mlp": 1.09220648, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.059509842301402785, + "language_loss": 1.01277101, + "learning_rate": 0.000999993788261765, + "loss": 1.02394545, + "num_input_tokens_seen": 
12594096, + "router_z_loss_mlp": 0.25231934, + "step": 164, + "time_per_iteration": 3.585451126098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117278, + "balance_loss_mlp": 1.09226811, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.08345282656088489, + "language_loss": 1.02586234, + "learning_rate": 0.00099999213827312, + "loss": 1.03703511, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.25036621, + "step": 165, + "time_per_iteration": 2.815709352493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126749, + "balance_loss_mlp": 1.10315728, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.06906572503593703, + "language_loss": 0.97404492, + "learning_rate": 0.000999990294170312, + "loss": 0.98531234, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.23596191, + "step": 166, + "time_per_iteration": 2.663247585296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123432, + "balance_loss_mlp": 1.09961414, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.06114993163800343, + "language_loss": 1.01775765, + "learning_rate": 0.0009999882559540566, + "loss": 1.02899194, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.23803711, + "step": 167, + "time_per_iteration": 2.6779284477233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113429, + "balance_loss_mlp": 1.08983719, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.051224230506458926, + "language_loss": 0.98247135, + "learning_rate": 0.000999986023625145, + "loss": 0.99360555, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.23571777, + "step": 168, + "time_per_iteration": 2.8207764625549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02968107, + "balance_loss_mlp": 2.92347527, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 
0.42377736764400964, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.81892526, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 0.44726562, + "step": 169, + "time_per_iteration": 5.030913591384888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110197, + "balance_loss_mlp": 1.08739257, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.11749522299339567, + "language_loss": 0.99391603, + "learning_rate": 0.0009999809766328958, + "loss": 1.005018, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.22839355, + "step": 170, + "time_per_iteration": 2.7288546562194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138899, + "balance_loss_mlp": 1.11526036, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.10262090882217431, + "language_loss": 1.01489758, + "learning_rate": 0.0009999781619715177, + "loss": 1.02628672, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.23620605, + "step": 171, + "time_per_iteration": 2.57743239402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152618, + "balance_loss_mlp": 1.12847793, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.09929883602390663, + "language_loss": 1.00886559, + "learning_rate": 0.000999975153201402, + "loss": 1.0203917, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.24121094, + "step": 172, + "time_per_iteration": 2.8398427963256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164032, + "balance_loss_mlp": 1.13917661, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.07630899187603161, + "language_loss": 0.98461914, + "learning_rate": 0.0009999719503237174, + "loss": 0.99625951, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.24865723, + "step": 173, + "time_per_iteration": 2.7653093338012695 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0119131, + "balance_loss_mlp": 1.16379607, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 0.11225996182460839, + "language_loss": 1.07204938, + "learning_rate": 0.0009999685533397073, + "loss": 1.08396256, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.27514648, + "step": 174, + "time_per_iteration": 2.560985565185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174784, + "balance_loss_mlp": 1.14617324, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.09969766363227954, + "language_loss": 0.99471402, + "learning_rate": 0.00099996496225069, + "loss": 1.00646186, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.28637695, + "step": 175, + "time_per_iteration": 2.685511589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191936, + "balance_loss_mlp": 1.16053653, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.07081110533815538, + "language_loss": 1.01830065, + "learning_rate": 0.0009999611770580604, + "loss": 1.03022003, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.31396484, + "step": 176, + "time_per_iteration": 2.848646879196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184826, + "balance_loss_mlp": 1.1498735, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.08630072774372038, + "language_loss": 1.00571251, + "learning_rate": 0.0009999571977632876, + "loss": 1.01756072, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.34960938, + "step": 177, + "time_per_iteration": 2.5936646461486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183799, + "balance_loss_mlp": 1.14896631, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.14796843181278477, + "language_loss": 1.03395152, + "learning_rate": 0.0009999530243679166, + "loss": 1.04578948, + "num_input_tokens_seen": 
13733968, + "router_z_loss_mlp": 0.34863281, + "step": 178, + "time_per_iteration": 2.578585386276245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119148, + "balance_loss_mlp": 1.15502596, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.07456630143082679, + "language_loss": 0.98466933, + "learning_rate": 0.0009999486568735675, + "loss": 0.99658418, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.36450195, + "step": 179, + "time_per_iteration": 3.0958807468414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204458, + "balance_loss_mlp": 1.16657281, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.1071704794457763, + "language_loss": 0.98888862, + "learning_rate": 0.0009999440952819362, + "loss": 1.00093329, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.37841797, + "step": 180, + "time_per_iteration": 3.7027652263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119721, + "balance_loss_mlp": 1.1615665, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.09808227157719927, + "language_loss": 0.98941529, + "learning_rate": 0.0009999393395947935, + "loss": 1.00138736, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.35644531, + "step": 181, + "time_per_iteration": 2.9549217224121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178907, + "balance_loss_mlp": 1.1453855, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.07390299993950959, + "language_loss": 1.01616848, + "learning_rate": 0.0009999343898139858, + "loss": 1.02795744, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.33520508, + "step": 182, + "time_per_iteration": 2.6392982006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183604, + "balance_loss_mlp": 1.15117884, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 
0.07686941510438546, + "language_loss": 1.00897217, + "learning_rate": 0.0009999292459414348, + "loss": 1.02080822, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.32397461, + "step": 183, + "time_per_iteration": 2.657658338546753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158427, + "balance_loss_mlp": 1.12702751, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.08111160194171327, + "language_loss": 1.04917085, + "learning_rate": 0.0009999239079791374, + "loss": 1.06075525, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.3137207, + "step": 184, + "time_per_iteration": 2.631359815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115468, + "balance_loss_mlp": 1.12359011, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.06813164935019152, + "language_loss": 0.98483247, + "learning_rate": 0.0009999183759291659, + "loss": 0.99637926, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.31054688, + "step": 185, + "time_per_iteration": 2.7329554557800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133475, + "balance_loss_mlp": 1.10393476, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.1084122935019402, + "language_loss": 1.00212467, + "learning_rate": 0.0009999126497936682, + "loss": 1.01345944, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.29516602, + "step": 186, + "time_per_iteration": 2.5334415435791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110251, + "balance_loss_mlp": 1.08080626, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.057007065611444814, + "language_loss": 1.03274298, + "learning_rate": 0.0009999067295748676, + "loss": 1.04384542, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.29443359, + "step": 187, + "time_per_iteration": 2.8514976501464844 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01120219, + "balance_loss_mlp": 1.09280062, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 0.1063888726335035, + "language_loss": 1.00729585, + "learning_rate": 0.000999900615275062, + "loss": 1.01849806, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.27441406, + "step": 188, + "time_per_iteration": 2.7070038318634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115411, + "balance_loss_mlp": 1.08773041, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.10114859104821755, + "language_loss": 1.06676006, + "learning_rate": 0.0009998943068966256, + "loss": 1.07791412, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.27709961, + "step": 189, + "time_per_iteration": 2.459259271621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128247, + "balance_loss_mlp": 1.0989449, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.09267057508255847, + "language_loss": 1.01174653, + "learning_rate": 0.0009998878044420072, + "loss": 1.02302897, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.29296875, + "step": 190, + "time_per_iteration": 2.710602045059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128613, + "balance_loss_mlp": 1.09881067, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.06756260422642338, + "language_loss": 0.9758327, + "learning_rate": 0.0009998811079137318, + "loss": 0.98711884, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.2980957, + "step": 191, + "time_per_iteration": 2.657074451446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144449, + "balance_loss_mlp": 1.11092758, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.08238379115749897, + "language_loss": 0.98845601, + "learning_rate": 0.0009998742173143987, + "loss": 0.99990052, + "num_input_tokens_seen": 14749712, + 
"router_z_loss_mlp": 0.33544922, + "step": 192, + "time_per_iteration": 2.637148857116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155548, + "balance_loss_mlp": 1.12164509, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.08708605999438765, + "language_loss": 0.98628879, + "learning_rate": 0.0009998671326466833, + "loss": 0.99784422, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.33911133, + "step": 193, + "time_per_iteration": 3.0115370750427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152209, + "balance_loss_mlp": 1.11556399, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.10177169507488108, + "language_loss": 0.98986697, + "learning_rate": 0.0009998598539133362, + "loss": 1.00138903, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.36645508, + "step": 194, + "time_per_iteration": 3.144454002380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161775, + "balance_loss_mlp": 1.12694228, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.0772667916065631, + "language_loss": 1.00733018, + "learning_rate": 0.0009998523811171828, + "loss": 1.01894796, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.34863281, + "step": 195, + "time_per_iteration": 2.577711820602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158404, + "balance_loss_mlp": 1.12314177, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.12690614805983907, + "language_loss": 1.00355625, + "learning_rate": 0.0009998447142611248, + "loss": 1.0151403, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.3527832, + "step": 196, + "time_per_iteration": 2.690129041671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157374, + "balance_loss_mlp": 1.12332833, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 
0.06577024943575122, + "language_loss": 0.94151276, + "learning_rate": 0.0009998368533481387, + "loss": 0.9530865, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.34057617, + "step": 197, + "time_per_iteration": 3.045903444290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135441, + "balance_loss_mlp": 1.10120416, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.07117988238902957, + "language_loss": 0.9709003, + "learning_rate": 0.0009998287983812762, + "loss": 0.98225474, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.34277344, + "step": 198, + "time_per_iteration": 2.8663957118988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153893, + "balance_loss_mlp": 1.11910725, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.08607783918573575, + "language_loss": 1.02875066, + "learning_rate": 0.0009998205493636646, + "loss": 1.04028964, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.34790039, + "step": 199, + "time_per_iteration": 2.6874265670776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113429, + "balance_loss_mlp": 1.10010099, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.06925160872633124, + "language_loss": 0.95776969, + "learning_rate": 0.0009998121062985063, + "loss": 0.96911263, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.34179688, + "step": 200, + "time_per_iteration": 2.7165024280548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137597, + "balance_loss_mlp": 1.10424268, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.05789149863906192, + "language_loss": 0.98006374, + "learning_rate": 0.0009998034691890794, + "loss": 0.9914397, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.33349609, + "step": 201, + "time_per_iteration": 2.8032913208007812 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01122408, + "balance_loss_mlp": 1.09148479, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 0.07027358299059557, + "language_loss": 1.02082264, + "learning_rate": 0.0009997946380387369, + "loss": 1.03204679, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.30932617, + "step": 202, + "time_per_iteration": 2.6880364418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113731, + "balance_loss_mlp": 1.08206952, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.06026816631059916, + "language_loss": 1.0439496, + "learning_rate": 0.0009997856128509076, + "loss": 1.05508685, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.31665039, + "step": 203, + "time_per_iteration": 2.8704147338867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120628, + "balance_loss_mlp": 1.089324, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.09379084264769941, + "language_loss": 0.99581945, + "learning_rate": 0.0009997763936290952, + "loss": 1.00702572, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.31298828, + "step": 204, + "time_per_iteration": 2.527740478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131571, + "balance_loss_mlp": 1.09924173, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.09654929574768753, + "language_loss": 1.03863358, + "learning_rate": 0.0009997669803768789, + "loss": 1.04994941, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.32324219, + "step": 205, + "time_per_iteration": 2.7987287044525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114569, + "balance_loss_mlp": 1.08383656, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.07990731679878747, + "language_loss": 0.99632657, + "learning_rate": 0.0009997573730979134, + "loss": 1.00747228, + "num_input_tokens_seen": 
15768752, + "router_z_loss_mlp": 0.30712891, + "step": 206, + "time_per_iteration": 2.73876953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03108122, + "balance_loss_mlp": 2.9547708, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.40060181244225235, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.82301319, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 1.53125, + "step": 207, + "time_per_iteration": 4.722966432571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165141, + "balance_loss_mlp": 1.13169098, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.10630094281595456, + "language_loss": 0.98190439, + "learning_rate": 0.0009997375764747294, + "loss": 0.99355578, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.33447266, + "step": 208, + "time_per_iteration": 3.063753128051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176428, + "balance_loss_mlp": 1.14395499, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.08070097632442315, + "language_loss": 0.96488065, + "learning_rate": 0.0009997273871381967, + "loss": 0.97664487, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.32470703, + "step": 209, + "time_per_iteration": 2.738070249557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199702, + "balance_loss_mlp": 1.16675293, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.07976675940517855, + "language_loss": 1.01156783, + "learning_rate": 0.0009997170037902862, + "loss": 1.02356482, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.32958984, + "step": 210, + "time_per_iteration": 2.7301113605499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207423, + "balance_loss_mlp": 1.17323339, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 
0.10146454126791286, + "language_loss": 1.03771198, + "learning_rate": 0.0009997064264350292, + "loss": 1.04978609, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.34228516, + "step": 211, + "time_per_iteration": 2.8760437965393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199912, + "balance_loss_mlp": 1.16364872, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.07215586978638981, + "language_loss": 0.9769634, + "learning_rate": 0.0009996956550765317, + "loss": 0.98896253, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.36254883, + "step": 212, + "time_per_iteration": 2.704005479812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209491, + "balance_loss_mlp": 1.17389536, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07781252726849613, + "language_loss": 0.9221555, + "learning_rate": 0.0009996846897189762, + "loss": 0.93425035, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.35595703, + "step": 213, + "time_per_iteration": 2.6465373039245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209061, + "balance_loss_mlp": 1.17510998, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.09713937665314668, + "language_loss": 0.99115217, + "learning_rate": 0.0009996735303666193, + "loss": 1.00324273, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.33984375, + "step": 214, + "time_per_iteration": 2.7262256145477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217734, + "balance_loss_mlp": 1.18261504, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.0828033847449013, + "language_loss": 1.01114583, + "learning_rate": 0.0009996621770237937, + "loss": 1.02332306, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.35131836, + "step": 215, + "time_per_iteration": 2.774261951446533 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01228156, + "balance_loss_mlp": 1.19122505, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 0.09368483866206018, + "language_loss": 0.9696883, + "learning_rate": 0.0009996506296949073, + "loss": 0.98196977, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.36889648, + "step": 216, + "time_per_iteration": 2.9090492725372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227468, + "balance_loss_mlp": 1.18944013, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.07734539448931728, + "language_loss": 0.9667756, + "learning_rate": 0.0009996388883844428, + "loss": 0.97905028, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.38037109, + "step": 217, + "time_per_iteration": 2.6576592922210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219854, + "balance_loss_mlp": 1.18299437, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.05044055802439308, + "language_loss": 1.01232481, + "learning_rate": 0.0009996269530969588, + "loss": 1.02452338, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.36865234, + "step": 218, + "time_per_iteration": 2.5997114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212901, + "balance_loss_mlp": 1.17787719, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.09536649242864963, + "language_loss": 0.99537694, + "learning_rate": 0.0009996148238370888, + "loss": 1.00750601, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.3503418, + "step": 219, + "time_per_iteration": 2.794192314147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210779, + "balance_loss_mlp": 1.17465854, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.05253448416537987, + "language_loss": 0.95164675, + "learning_rate": 0.0009996025006095421, + "loss": 0.96375448, + "num_input_tokens_seen": 
16950416, + "router_z_loss_mlp": 0.36132812, + "step": 220, + "time_per_iteration": 3.387816905975342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03435379, + "balance_loss_mlp": 3.13935852, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.2631843872282414, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81218523, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 2.953125, + "step": 221, + "time_per_iteration": 5.1907196044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198698, + "balance_loss_mlp": 1.16422272, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.084470461812628, + "language_loss": 0.96455717, + "learning_rate": 0.0009995772722706307, + "loss": 0.97654414, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.3449707, + "step": 222, + "time_per_iteration": 2.817683219909668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196045, + "balance_loss_mlp": 1.16013885, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.09039049489351958, + "language_loss": 1.09978271, + "learning_rate": 0.0009995643671690604, + "loss": 1.11174321, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.35888672, + "step": 223, + "time_per_iteration": 2.473952293395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187658, + "balance_loss_mlp": 1.15511417, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.098631665550633, + "language_loss": 0.9726367, + "learning_rate": 0.0009995512681194023, + "loss": 0.98451328, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.32543945, + "step": 224, + "time_per_iteration": 2.8320233821868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173941, + "balance_loss_mlp": 1.14256525, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 
0.09492392354161142, + "language_loss": 0.95751745, + "learning_rate": 0.0009995379751267417, + "loss": 0.96925682, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.31347656, + "step": 225, + "time_per_iteration": 3.265004873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151277, + "balance_loss_mlp": 1.11923385, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.07692691631212083, + "language_loss": 0.96905231, + "learning_rate": 0.0009995244881962398, + "loss": 0.98056507, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.3203125, + "step": 226, + "time_per_iteration": 2.6380093097686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122529, + "balance_loss_mlp": 1.09217834, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.1280080940779162, + "language_loss": 0.97453952, + "learning_rate": 0.0009995108073331323, + "loss": 0.98576486, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.30322266, + "step": 227, + "time_per_iteration": 2.611384630203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116442, + "balance_loss_mlp": 1.08482742, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.05834750559212819, + "language_loss": 1.00860834, + "learning_rate": 0.0009994969325427309, + "loss": 1.01977265, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.31640625, + "step": 228, + "time_per_iteration": 2.690300941467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123233, + "balance_loss_mlp": 1.08851922, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.06096273128167382, + "language_loss": 0.96635395, + "learning_rate": 0.0009994828638304218, + "loss": 0.97758633, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.34716797, + "step": 229, + "time_per_iteration": 2.666841506958008 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01128717, + "balance_loss_mlp": 1.093979, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.08326095283531681, + "language_loss": 1.02012706, + "learning_rate": 0.0009994686012016675, + "loss": 1.03141427, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.34765625, + "step": 230, + "time_per_iteration": 2.5846869945526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153704, + "balance_loss_mlp": 1.12056351, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.09069798767816209, + "language_loss": 1.01831698, + "learning_rate": 0.000999454144662005, + "loss": 1.02985406, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.33154297, + "step": 231, + "time_per_iteration": 2.923693895339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115334, + "balance_loss_mlp": 1.11736226, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.09055690768180072, + "language_loss": 0.95778871, + "learning_rate": 0.0009994394942170468, + "loss": 0.9693222, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.35961914, + "step": 232, + "time_per_iteration": 2.7030160427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142015, + "balance_loss_mlp": 1.10806465, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.058800323500690845, + "language_loss": 0.93958372, + "learning_rate": 0.0009994246498724808, + "loss": 0.95100385, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.33984375, + "step": 233, + "time_per_iteration": 2.7212979793548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138192, + "balance_loss_mlp": 1.10519481, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.06773344256027236, + "language_loss": 0.96352422, + "learning_rate": 0.00099940961163407, + "loss": 0.97490609, + "num_input_tokens_seen": 
18123296, + "router_z_loss_mlp": 0.33007812, + "step": 234, + "time_per_iteration": 2.901205062866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136264, + "balance_loss_mlp": 1.10300505, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.061338570366332154, + "language_loss": 0.98733097, + "learning_rate": 0.0009993943795076528, + "loss": 0.99869365, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.33251953, + "step": 235, + "time_per_iteration": 2.6201589107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132034, + "balance_loss_mlp": 1.09834564, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.07983858027410345, + "language_loss": 1.00555849, + "learning_rate": 0.0009993789534991427, + "loss": 1.01687884, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.33691406, + "step": 236, + "time_per_iteration": 2.454946279525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132728, + "balance_loss_mlp": 1.0996356, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.08392760705769248, + "language_loss": 0.95816457, + "learning_rate": 0.0009993633336145287, + "loss": 0.96949184, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.33056641, + "step": 237, + "time_per_iteration": 2.6566781997680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128359, + "balance_loss_mlp": 1.09655356, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.08042180371297789, + "language_loss": 1.00147879, + "learning_rate": 0.0009993475198598752, + "loss": 1.01276243, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.31811523, + "step": 238, + "time_per_iteration": 3.0513856410980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126034, + "balance_loss_mlp": 1.09301257, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 
0.0829568534139584, + "language_loss": 0.95935237, + "learning_rate": 0.0009993315122413212, + "loss": 0.97061276, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.33007812, + "step": 239, + "time_per_iteration": 2.659076690673828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138389, + "balance_loss_mlp": 1.10458112, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.07781318144537454, + "language_loss": 0.96732402, + "learning_rate": 0.0009993153107650818, + "loss": 0.97870797, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.33813477, + "step": 240, + "time_per_iteration": 2.6491312980651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141869, + "balance_loss_mlp": 1.10593915, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.09031233919320754, + "language_loss": 0.95913565, + "learning_rate": 0.0009992989154374468, + "loss": 0.97055435, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.35961914, + "step": 241, + "time_per_iteration": 2.5679047107696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153823, + "balance_loss_mlp": 1.11829901, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.07248816816072506, + "language_loss": 1.03108311, + "learning_rate": 0.0009992823262647817, + "loss": 1.04262137, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.35546875, + "step": 242, + "time_per_iteration": 2.7263669967651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146146, + "balance_loss_mlp": 1.11167073, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.08958834607355992, + "language_loss": 0.96952182, + "learning_rate": 0.0009992655432535264, + "loss": 0.98098326, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.3449707, + "step": 243, + "time_per_iteration": 2.7712135314941406 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01156392, + "balance_loss_mlp": 1.12115347, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 0.06980487860605987, + "language_loss": 0.97863543, + "learning_rate": 0.0009992485664101973, + "loss": 0.99019933, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.35229492, + "step": 244, + "time_per_iteration": 2.7024378776550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164354, + "balance_loss_mlp": 1.12825704, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.059856394455686884, + "language_loss": 0.9987036, + "learning_rate": 0.000999231395741385, + "loss": 1.01034713, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.36108398, + "step": 245, + "time_per_iteration": 3.1183571815490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165942, + "balance_loss_mlp": 1.13215792, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.0943556711706318, + "language_loss": 0.97312224, + "learning_rate": 0.0009992140312537557, + "loss": 0.98478168, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.33789062, + "step": 246, + "time_per_iteration": 2.6516497135162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144759, + "balance_loss_mlp": 1.11121345, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.07660143361567079, + "language_loss": 0.93426013, + "learning_rate": 0.000999196472954051, + "loss": 0.94570768, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.33569336, + "step": 247, + "time_per_iteration": 2.975703477859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.06084414, + "balance_loss_mlp": 5.7578764, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.5887991941215185, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.85509264, + "num_input_tokens_seen": 19286288, 
+ "router_z_loss_mlp": 3.265625, + "step": 248, + "time_per_iteration": 5.566707372665405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147412, + "balance_loss_mlp": 1.11617875, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.08054908277290292, + "language_loss": 0.99819887, + "learning_rate": 0.0009991607749457578, + "loss": 1.00967312, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.31225586, + "step": 249, + "time_per_iteration": 2.601257801055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179313, + "balance_loss_mlp": 1.14769912, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.0802426637388702, + "language_loss": 0.97979879, + "learning_rate": 0.0009991426352510286, + "loss": 0.99159187, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.31591797, + "step": 250, + "time_per_iteration": 3.036884069442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221052, + "balance_loss_mlp": 1.18660045, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.10047763480801107, + "language_loss": 0.99211901, + "learning_rate": 0.0009991243017719422, + "loss": 1.00432956, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.34448242, + "step": 251, + "time_per_iteration": 2.6728298664093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221015, + "balance_loss_mlp": 1.18696856, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.09158100422304945, + "language_loss": 0.93989825, + "learning_rate": 0.0009991057745156165, + "loss": 0.95210844, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.34033203, + "step": 252, + "time_per_iteration": 2.6554462909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03874573, + "balance_loss_mlp": 3.65637207, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 
0.4297237142905687, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.85785556, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 2.1875, + "step": 253, + "time_per_iteration": 5.027901649475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243575, + "balance_loss_mlp": 1.20886147, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.13167813172379958, + "language_loss": 1.02751815, + "learning_rate": 0.0009990681387000943, + "loss": 1.03995395, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.34716797, + "step": 254, + "time_per_iteration": 2.830775260925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287984, + "balance_loss_mlp": 1.25019443, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.07749867859212424, + "language_loss": 0.9817788, + "learning_rate": 0.0009990490301555093, + "loss": 0.99465859, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.37792969, + "step": 255, + "time_per_iteration": 2.9786195755004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04361559, + "balance_loss_mlp": 4.02739191, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.4777758897592442, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.83576715, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 3.34375, + "step": 256, + "time_per_iteration": 4.883893013000488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03108787, + "balance_loss_mlp": 2.91576314, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.20418401203526695, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.8235153, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.9296875, + "step": 257, + "time_per_iteration": 4.981754541397095 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.03419801, + "balance_loss_mlp": 3.31070042, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 0.4614192090098086, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.73395681, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.09375, + "step": 258, + "time_per_iteration": 4.904312372207642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0147617, + "balance_loss_mlp": 1.43730807, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.19757670960702672, + "language_loss": 0.92998719, + "learning_rate": 0.0009989706585723202, + "loss": 0.94474888, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.38867188, + "step": 259, + "time_per_iteration": 2.8021159172058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01539233, + "balance_loss_mlp": 1.49808145, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.19510301282339976, + "language_loss": 0.99383926, + "learning_rate": 0.0009989505813633442, + "loss": 1.00923157, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.41137695, + "step": 260, + "time_per_iteration": 2.6653668880462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01478791, + "balance_loss_mlp": 1.4348743, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.10786879930825251, + "language_loss": 0.98759341, + "learning_rate": 0.000998930310444573, + "loss": 1.00238132, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.43920898, + "step": 261, + "time_per_iteration": 2.7604081630706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432802, + "balance_loss_mlp": 1.38426006, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.09058377349206405, + "language_loss": 0.96455801, + "learning_rate": 0.0009989098458238765, + "loss": 0.97888601, + "num_input_tokens_seen": 20931232, 
+ "router_z_loss_mlp": 0.4855957, + "step": 262, + "time_per_iteration": 2.8061673641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01428574, + "balance_loss_mlp": 1.3737855, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.09431506628041801, + "language_loss": 0.959288, + "learning_rate": 0.0009988891875091998, + "loss": 0.9735738, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.54833984, + "step": 263, + "time_per_iteration": 2.756467819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0142654, + "balance_loss_mlp": 1.36974835, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.10391907645657336, + "language_loss": 0.90729272, + "learning_rate": 0.0009988683355085636, + "loss": 0.92155808, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.56787109, + "step": 264, + "time_per_iteration": 2.7685976028442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420401, + "balance_loss_mlp": 1.3644681, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.09802606586789958, + "language_loss": 0.99670649, + "learning_rate": 0.000998847289830063, + "loss": 1.01091051, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.55957031, + "step": 265, + "time_per_iteration": 2.831874132156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01390772, + "balance_loss_mlp": 1.34082305, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.13175698376961376, + "language_loss": 0.92018604, + "learning_rate": 0.0009988260504818682, + "loss": 0.93409377, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.49926758, + "step": 266, + "time_per_iteration": 2.5666043758392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01364075, + "balance_loss_mlp": 1.31720233, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.11617121831129276, 
+ "language_loss": 0.98586178, + "learning_rate": 0.000998804617472226, + "loss": 0.99950248, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.46899414, + "step": 267, + "time_per_iteration": 2.683875322341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339401, + "balance_loss_mlp": 1.29844046, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.13482471872787388, + "language_loss": 0.93566334, + "learning_rate": 0.0009987829908094568, + "loss": 0.94905734, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.40966797, + "step": 268, + "time_per_iteration": 2.844641923904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270506, + "balance_loss_mlp": 1.23007023, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.10753691268900553, + "language_loss": 1.00233316, + "learning_rate": 0.0009987611705019569, + "loss": 1.01503825, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.40454102, + "step": 269, + "time_per_iteration": 4.188141107559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223109, + "balance_loss_mlp": 1.1811955, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.09459011438584931, + "language_loss": 0.9928273, + "learning_rate": 0.0009987391565581978, + "loss": 1.00505841, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.41943359, + "step": 270, + "time_per_iteration": 2.603743076324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187631, + "balance_loss_mlp": 1.14400077, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.06481058483540457, + "language_loss": 0.91893035, + "learning_rate": 0.000998716948986726, + "loss": 0.93080664, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.43652344, + "step": 271, + "time_per_iteration": 2.8780717849731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01189162, + "balance_loss_mlp": 1.14545989, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 0.0816946734367831, + "language_loss": 0.93787229, + "learning_rate": 0.0009986945477961633, + "loss": 0.94976389, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.4375, + "step": 272, + "time_per_iteration": 2.723017692565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181133, + "balance_loss_mlp": 1.13828969, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.0734282707774283, + "language_loss": 0.99389303, + "learning_rate": 0.0009986719529952066, + "loss": 1.00570428, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.4284668, + "step": 273, + "time_per_iteration": 2.8852784633636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175268, + "balance_loss_mlp": 1.13082659, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.10629611668364672, + "language_loss": 0.98564589, + "learning_rate": 0.000998649164592628, + "loss": 0.99739856, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.44458008, + "step": 274, + "time_per_iteration": 2.616504430770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151835, + "balance_loss_mlp": 1.1077987, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.10641977070505904, + "language_loss": 0.95747149, + "learning_rate": 0.0009986261825972748, + "loss": 0.96898991, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.44018555, + "step": 275, + "time_per_iteration": 2.7185463905334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170515, + "balance_loss_mlp": 1.12447667, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.09271858345864015, + "language_loss": 0.98292786, + "learning_rate": 0.000998603007018069, + "loss": 0.99463308, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 
0.46044922, + "step": 276, + "time_per_iteration": 2.884373188018799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120113, + "balance_loss_mlp": 1.15065718, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.06824174267425122, + "language_loss": 0.95424223, + "learning_rate": 0.0009985796378640089, + "loss": 0.96625352, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.50512695, + "step": 277, + "time_per_iteration": 2.766671895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196508, + "balance_loss_mlp": 1.14670205, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.07462742938020851, + "language_loss": 0.95504081, + "learning_rate": 0.0009985560751441665, + "loss": 0.96700585, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.49829102, + "step": 278, + "time_per_iteration": 2.8290188312530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202501, + "balance_loss_mlp": 1.1519084, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.08249787624351518, + "language_loss": 0.97367889, + "learning_rate": 0.00099853231886769, + "loss": 0.98570395, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.5065918, + "step": 279, + "time_per_iteration": 2.7985732555389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208136, + "balance_loss_mlp": 1.15880692, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06817333546872655, + "language_loss": 0.98251152, + "learning_rate": 0.0009985083690438024, + "loss": 0.99459285, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.4934082, + "step": 280, + "time_per_iteration": 2.711107015609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120231, + "balance_loss_mlp": 1.15419662, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.06285675396315912, + "language_loss": 
0.88899338, + "learning_rate": 0.0009984842256818016, + "loss": 0.90101647, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.48095703, + "step": 281, + "time_per_iteration": 3.089395761489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118336, + "balance_loss_mlp": 1.13934779, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.09184892817545263, + "language_loss": 0.99464393, + "learning_rate": 0.0009984598887910613, + "loss": 1.00647748, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.43994141, + "step": 282, + "time_per_iteration": 2.809372663497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193758, + "balance_loss_mlp": 1.14736223, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.0862697219544723, + "language_loss": 0.95099992, + "learning_rate": 0.0009984353583810297, + "loss": 0.96293747, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.46386719, + "step": 283, + "time_per_iteration": 2.887547016143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174794, + "balance_loss_mlp": 1.12997127, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.07077343192171563, + "language_loss": 0.96608889, + "learning_rate": 0.0009984106344612302, + "loss": 0.97783673, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.44799805, + "step": 284, + "time_per_iteration": 2.7930290699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158485, + "balance_loss_mlp": 1.11640382, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.07340568947827376, + "language_loss": 0.92955279, + "learning_rate": 0.0009983857170412615, + "loss": 0.94113761, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.42089844, + "step": 285, + "time_per_iteration": 3.0093743801116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165069, + 
"balance_loss_mlp": 1.1219871, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 0.05960836075086468, + "language_loss": 0.92676461, + "learning_rate": 0.000998360606130798, + "loss": 0.93841541, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.43041992, + "step": 286, + "time_per_iteration": 2.837170362472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03710432, + "balance_loss_mlp": 3.53495646, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.1985650778679295, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.72783548, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.7578125, + "step": 287, + "time_per_iteration": 4.929426908493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157732, + "balance_loss_mlp": 1.11290884, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.08938470510968509, + "language_loss": 0.98063755, + "learning_rate": 0.0009983098038774552, + "loss": 0.99221486, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.44799805, + "step": 288, + "time_per_iteration": 2.8677265644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03445158, + "balance_loss_mlp": 3.31088066, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.2206810579053755, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.81615388, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.34375, + "step": 289, + "time_per_iteration": 4.795354604721069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204321, + "balance_loss_mlp": 1.15992737, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.08343344919246831, + "language_loss": 0.96212429, + "learning_rate": 0.0009982582277800948, + "loss": 0.97416747, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.44360352, + 
"step": 290, + "time_per_iteration": 2.610515832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201009, + "balance_loss_mlp": 1.15659118, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.09373610552028779, + "language_loss": 1.02980018, + "learning_rate": 0.0009982321495648908, + "loss": 1.04181027, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.4440918, + "step": 291, + "time_per_iteration": 2.847222089767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213433, + "balance_loss_mlp": 1.16884899, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.12267020035660053, + "language_loss": 0.94884562, + "learning_rate": 0.0009982058779188115, + "loss": 0.96097994, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.44604492, + "step": 292, + "time_per_iteration": 2.7585439682006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190958, + "balance_loss_mlp": 1.14596868, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.07287048907504978, + "language_loss": 1.01494539, + "learning_rate": 0.0009981794128520567, + "loss": 1.02685499, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.44970703, + "step": 293, + "time_per_iteration": 2.8542449474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194856, + "balance_loss_mlp": 1.14817381, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.060100997943795566, + "language_loss": 0.98246396, + "learning_rate": 0.000998152754374901, + "loss": 0.99441248, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.46704102, + "step": 294, + "time_per_iteration": 2.897792100906372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183893, + "balance_loss_mlp": 1.13856936, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.0698691020933478, + "language_loss": 0.94496101, + 
"learning_rate": 0.0009981259024976943, + "loss": 0.95679998, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.453125, + "step": 295, + "time_per_iteration": 2.7404842376708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186618, + "balance_loss_mlp": 1.14067447, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.10167990029855892, + "language_loss": 0.92340136, + "learning_rate": 0.0009980988572308612, + "loss": 0.93526757, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.45922852, + "step": 296, + "time_per_iteration": 3.007516384124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169257, + "balance_loss_mlp": 1.12450624, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.07320939901236567, + "language_loss": 0.95507723, + "learning_rate": 0.0009980716185849015, + "loss": 0.96676981, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.44775391, + "step": 297, + "time_per_iteration": 2.9953107833862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163884, + "balance_loss_mlp": 1.12180316, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06695295039959538, + "language_loss": 0.92045325, + "learning_rate": 0.0009980441865703904, + "loss": 0.93209207, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.4206543, + "step": 298, + "time_per_iteration": 2.6119296550750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149691, + "balance_loss_mlp": 1.10896909, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.07389257813376128, + "language_loss": 1.00092888, + "learning_rate": 0.000998016561197978, + "loss": 1.0124259, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.40698242, + "step": 299, + "time_per_iteration": 2.776057004928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139916, + "balance_loss_mlp": 
1.10072017, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 0.08850581007108178, + "language_loss": 0.91981971, + "learning_rate": 0.0009979887424783895, + "loss": 0.93121886, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.39208984, + "step": 300, + "time_per_iteration": 2.9253783226013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114416, + "balance_loss_mlp": 1.10362935, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.06286570971506464, + "language_loss": 0.91965425, + "learning_rate": 0.0009979607304224248, + "loss": 0.93109584, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.40527344, + "step": 301, + "time_per_iteration": 2.7880210876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148274, + "balance_loss_mlp": 1.10626435, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.07282163575611278, + "language_loss": 0.98193479, + "learning_rate": 0.000997932525040959, + "loss": 0.9934175, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.41992188, + "step": 302, + "time_per_iteration": 2.6913211345672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135024, + "balance_loss_mlp": 1.09647226, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.08010118219410382, + "language_loss": 1.00433981, + "learning_rate": 0.000997904126344943, + "loss": 1.01569009, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.38549805, + "step": 303, + "time_per_iteration": 2.648486375808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152351, + "balance_loss_mlp": 1.112535, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.07274054196633538, + "language_loss": 0.95938694, + "learning_rate": 0.0009978755343454018, + "loss": 0.97091049, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.39794922, + "step": 304, + 
"time_per_iteration": 2.7488231658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162494, + "balance_loss_mlp": 1.12279713, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.07785655900909055, + "language_loss": 0.97099572, + "learning_rate": 0.0009978467490534355, + "loss": 0.98262066, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.39697266, + "step": 305, + "time_per_iteration": 2.5928122997283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161095, + "balance_loss_mlp": 1.12101698, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.06710807116161162, + "language_loss": 0.94506705, + "learning_rate": 0.00099781777048022, + "loss": 0.95667803, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.40087891, + "step": 306, + "time_per_iteration": 2.7071874141693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166562, + "balance_loss_mlp": 1.12727094, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.06805578843696672, + "language_loss": 0.95336848, + "learning_rate": 0.0009977885986370057, + "loss": 0.96503407, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.39282227, + "step": 307, + "time_per_iteration": 2.560727119445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181694, + "balance_loss_mlp": 1.14190209, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.07408509854998435, + "language_loss": 0.92084455, + "learning_rate": 0.000997759233535118, + "loss": 0.93266147, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.39770508, + "step": 308, + "time_per_iteration": 2.811706304550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199188, + "balance_loss_mlp": 1.15813279, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.11332767927985109, + "language_loss": 0.97065681, + 
"learning_rate": 0.0009977296751859576, + "loss": 0.98264867, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.41040039, + "step": 309, + "time_per_iteration": 2.8100500106811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182124, + "balance_loss_mlp": 1.14152098, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.06886541031259097, + "language_loss": 0.99580777, + "learning_rate": 0.0009976999236009998, + "loss": 1.00762904, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.40576172, + "step": 310, + "time_per_iteration": 2.7856838703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116778, + "balance_loss_mlp": 1.12984788, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.07671642451322926, + "language_loss": 1.00938904, + "learning_rate": 0.0009976699787917955, + "loss": 1.02106678, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.37939453, + "step": 311, + "time_per_iteration": 2.679760217666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01912771, + "balance_loss_mlp": 1.87653184, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.11004817833063929, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75355768, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.36328125, + "step": 312, + "time_per_iteration": 5.035902976989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167025, + "balance_loss_mlp": 1.12742412, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.08745367830689305, + "language_loss": 0.92707014, + "learning_rate": 0.0009976095095472243, + "loss": 0.93874037, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.39575195, + "step": 313, + "time_per_iteration": 2.606323480606079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137298, + "balance_loss_mlp": 
1.10091519, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 0.07680079441574393, + "language_loss": 0.94012022, + "learning_rate": 0.0009975789851353334, + "loss": 0.95149314, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.36352539, + "step": 314, + "time_per_iteration": 2.838961362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135404, + "balance_loss_mlp": 1.10076201, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.07916345547758051, + "language_loss": 0.96821368, + "learning_rate": 0.0009975482675461487, + "loss": 0.97956777, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.34643555, + "step": 315, + "time_per_iteration": 2.6935253143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122263, + "balance_loss_mlp": 1.08905149, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.06025100036286014, + "language_loss": 0.94348001, + "learning_rate": 0.0009975173567915952, + "loss": 0.95470262, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.33203125, + "step": 316, + "time_per_iteration": 2.784148931503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123821, + "balance_loss_mlp": 1.08903599, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.06288070363718151, + "language_loss": 0.8781901, + "learning_rate": 0.000997486252883674, + "loss": 0.88942832, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.34765625, + "step": 317, + "time_per_iteration": 2.8335070610046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130287, + "balance_loss_mlp": 1.09628844, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.08951651385504938, + "language_loss": 0.93891156, + "learning_rate": 0.0009974549558344602, + "loss": 0.95021445, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.33984375, + "step": 318, 
+ "time_per_iteration": 3.661447048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140147, + "balance_loss_mlp": 1.10564828, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.0956877361367619, + "language_loss": 1.0199635, + "learning_rate": 0.000997423465656105, + "loss": 1.03136492, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.34521484, + "step": 319, + "time_per_iteration": 2.7822437286376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124104, + "balance_loss_mlp": 1.08896148, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.10289983756800847, + "language_loss": 0.99710345, + "learning_rate": 0.0009973917823608335, + "loss": 1.00834441, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.3515625, + "step": 320, + "time_per_iteration": 2.631345272064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135481, + "balance_loss_mlp": 1.09964669, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.0680580088694669, + "language_loss": 0.95663267, + "learning_rate": 0.0009973599059609462, + "loss": 0.96798748, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.35839844, + "step": 321, + "time_per_iteration": 2.7266485691070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117054, + "balance_loss_mlp": 1.13201189, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.07460436538347456, + "language_loss": 0.9288404, + "learning_rate": 0.000997327836468819, + "loss": 0.9405458, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.38525391, + "step": 322, + "time_per_iteration": 2.673107385635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179324, + "balance_loss_mlp": 1.14246416, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.08768405045584388, + "language_loss": 0.95868701, + "learning_rate": 
0.000997295573896902, + "loss": 0.9704802, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.36865234, + "step": 323, + "time_per_iteration": 2.89715838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974818, + "balance_loss_mlp": 1.93609941, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.15129070182137194, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83170861, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.38671875, + "step": 324, + "time_per_iteration": 4.777086496353149 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01725792, + "balance_loss_mlp": 1.68592823, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.07336881302622408, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80297732, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.3984375, + "step": 325, + "time_per_iteration": 4.8852620124816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203893, + "balance_loss_mlp": 1.16684282, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.09305137701088195, + "language_loss": 0.90879977, + "learning_rate": 0.000997197627828043, + "loss": 0.92083865, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.37060547, + "step": 326, + "time_per_iteration": 2.615715980529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198178, + "balance_loss_mlp": 1.16174805, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.10757754770856821, + "language_loss": 0.86059356, + "learning_rate": 0.0009971645930629716, + "loss": 0.8725754, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.36450195, + "step": 327, + "time_per_iteration": 2.7753512859344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193235, + "balance_loss_mlp": 1.15790117, + "epoch": 
0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.0829627430200847, + "language_loss": 0.98908973, + "learning_rate": 0.0009971313652814872, + "loss": 1.00102198, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.35351562, + "step": 328, + "time_per_iteration": 2.8697071075439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183904, + "balance_loss_mlp": 1.14957154, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.07808148320278054, + "language_loss": 0.9654116, + "learning_rate": 0.0009970979444964903, + "loss": 0.97725058, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.34350586, + "step": 329, + "time_per_iteration": 3.013674259185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179752, + "balance_loss_mlp": 1.14446568, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.08385245466398004, + "language_loss": 0.97686106, + "learning_rate": 0.0009970643307209556, + "loss": 0.98865855, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.3527832, + "step": 330, + "time_per_iteration": 2.868323802947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168029, + "balance_loss_mlp": 1.13097858, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.08206463725837071, + "language_loss": 0.93874633, + "learning_rate": 0.0009970305239679334, + "loss": 0.95042664, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.37060547, + "step": 331, + "time_per_iteration": 2.8225202560424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178202, + "balance_loss_mlp": 1.14210534, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.07579712662789459, + "language_loss": 0.98774493, + "learning_rate": 0.0009969965242505483, + "loss": 0.99952692, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.36108398, + "step": 332, + "time_per_iteration": 
2.8107545375823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168447, + "balance_loss_mlp": 1.13325644, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.07917811788525977, + "language_loss": 0.94783902, + "learning_rate": 0.0009969623315820007, + "loss": 0.95952344, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.35180664, + "step": 333, + "time_per_iteration": 2.698505401611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171432, + "balance_loss_mlp": 1.13636017, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.0763666551446786, + "language_loss": 0.95210952, + "learning_rate": 0.000996927945975565, + "loss": 0.96382385, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.35083008, + "step": 334, + "time_per_iteration": 2.584472894668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115222, + "balance_loss_mlp": 1.11686206, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.08033900057819754, + "language_loss": 0.91956127, + "learning_rate": 0.0009968933674445906, + "loss": 0.93108344, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.35375977, + "step": 335, + "time_per_iteration": 2.689556837081909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114559, + "balance_loss_mlp": 1.11109114, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.06825993333788044, + "language_loss": 0.94537115, + "learning_rate": 0.0009968585960025028, + "loss": 0.95682704, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.34521484, + "step": 336, + "time_per_iteration": 3.009956121444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02330067, + "balance_loss_mlp": 2.29764199, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.13230953132672904, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, 
+ "loss": 0.79983252, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.32421875, + "step": 337, + "time_per_iteration": 4.800926685333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126942, + "balance_loss_mlp": 1.09404051, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.06377137616855041, + "language_loss": 0.92515147, + "learning_rate": 0.0009967884744390583, + "loss": 0.93642092, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.32910156, + "step": 338, + "time_per_iteration": 3.5464487075805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124603, + "balance_loss_mlp": 1.09043801, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.0855348813631026, + "language_loss": 0.93111128, + "learning_rate": 0.0009967531243449256, + "loss": 0.9423573, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.34155273, + "step": 339, + "time_per_iteration": 2.6777007579803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131136, + "balance_loss_mlp": 1.09642255, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.07604626819248426, + "language_loss": 1.00833654, + "learning_rate": 0.000996717581394126, + "loss": 1.01964784, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.34741211, + "step": 340, + "time_per_iteration": 2.6667256355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145305, + "balance_loss_mlp": 1.10975671, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.07959679456110856, + "language_loss": 1.00992751, + "learning_rate": 0.000996681845600459, + "loss": 1.02138054, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.35571289, + "step": 341, + "time_per_iteration": 2.6750872135162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168019, + "balance_loss_mlp": 1.13158822, + "epoch": 
0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.07803079751348088, + "language_loss": 0.92980075, + "learning_rate": 0.0009966459169777982, + "loss": 0.94148099, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.36425781, + "step": 342, + "time_per_iteration": 2.5240936279296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186775, + "balance_loss_mlp": 1.14920056, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.07114695189108672, + "language_loss": 1.02233219, + "learning_rate": 0.0009966097955400924, + "loss": 1.03419995, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.37597656, + "step": 343, + "time_per_iteration": 2.701003313064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182072, + "balance_loss_mlp": 1.14444947, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.07450598076984326, + "language_loss": 0.95542282, + "learning_rate": 0.0009965734813013652, + "loss": 0.96724355, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.3762207, + "step": 344, + "time_per_iteration": 2.823782444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196951, + "balance_loss_mlp": 1.15773153, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.0604450427343926, + "language_loss": 0.97975069, + "learning_rate": 0.0009965369742757151, + "loss": 0.9917202, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.39208984, + "step": 345, + "time_per_iteration": 2.5793161392211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222733, + "balance_loss_mlp": 1.18341792, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.07564429768448787, + "language_loss": 0.95189452, + "learning_rate": 0.0009965002744773152, + "loss": 0.96412188, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.39306641, + "step": 346, + "time_per_iteration": 
3.5293569564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225458, + "balance_loss_mlp": 1.18573725, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.07389332256160373, + "language_loss": 0.91674209, + "learning_rate": 0.0009964633819204139, + "loss": 0.92899668, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.3972168, + "step": 347, + "time_per_iteration": 2.672184705734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01858873, + "balance_loss_mlp": 1.81805611, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.060031539331637095, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83659983, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.40820312, + "step": 348, + "time_per_iteration": 4.947252988815308 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01829705, + "balance_loss_mlp": 1.78698003, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.05093987002559095, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76983589, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.42773438, + "step": 349, + "time_per_iteration": 4.949065923690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200476, + "balance_loss_mlp": 1.16268659, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.10157855165040833, + "language_loss": 0.91835332, + "learning_rate": 0.000996351547842304, + "loss": 0.93035805, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.37792969, + "step": 350, + "time_per_iteration": 3.195343255996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175391, + "balance_loss_mlp": 1.13869905, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.09856595883672854, + "language_loss": 0.90272117, + "learning_rate": 
0.0009963138843953744, + "loss": 0.91447508, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.3671875, + "step": 351, + "time_per_iteration": 2.6402506828308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141973, + "balance_loss_mlp": 1.10692537, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.061148145233813844, + "language_loss": 0.94241744, + "learning_rate": 0.000996276028262306, + "loss": 0.95383716, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.3503418, + "step": 352, + "time_per_iteration": 2.834099769592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112107, + "balance_loss_mlp": 1.08011079, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.08429524036953953, + "language_loss": 1.00538242, + "learning_rate": 0.0009962379794577964, + "loss": 1.01650345, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.31982422, + "step": 353, + "time_per_iteration": 2.6607887744903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110445, + "balance_loss_mlp": 1.07780528, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.07871401687807635, + "language_loss": 0.91255635, + "learning_rate": 0.000996199737996617, + "loss": 0.92366076, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.32641602, + "step": 354, + "time_per_iteration": 2.977060317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106148, + "balance_loss_mlp": 1.07484412, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.07891213217714192, + "language_loss": 0.99330544, + "learning_rate": 0.0009961613038936149, + "loss": 1.00436699, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.31274414, + "step": 355, + "time_per_iteration": 2.615016222000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097974, + "balance_loss_mlp": 1.06619298, + "epoch": 
0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.06589791904701883, + "language_loss": 0.92011106, + "learning_rate": 0.000996122677163711, + "loss": 0.93109083, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.31762695, + "step": 356, + "time_per_iteration": 2.844289541244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110581, + "balance_loss_mlp": 1.07848942, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.09636291923958067, + "language_loss": 0.97709715, + "learning_rate": 0.000996083857821902, + "loss": 0.98820293, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.32080078, + "step": 357, + "time_per_iteration": 3.0474655628204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137059, + "balance_loss_mlp": 1.10334635, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.09472058747565097, + "language_loss": 0.95954913, + "learning_rate": 0.0009960448458832588, + "loss": 0.97091973, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.3371582, + "step": 358, + "time_per_iteration": 2.7681682109832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153475, + "balance_loss_mlp": 1.12002492, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.10342324791005938, + "language_loss": 0.95369232, + "learning_rate": 0.000996005641362927, + "loss": 0.96522713, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.33447266, + "step": 359, + "time_per_iteration": 2.6423869132995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189921, + "balance_loss_mlp": 1.15472996, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.10829219970600838, + "language_loss": 0.98827034, + "learning_rate": 0.0009959662442761274, + "loss": 1.00016952, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.35205078, + "step": 360, + "time_per_iteration": 
2.941234827041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185036, + "balance_loss_mlp": 1.14810538, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.0683919199988589, + "language_loss": 0.92245018, + "learning_rate": 0.000995926654638155, + "loss": 0.9343006, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.36938477, + "step": 361, + "time_per_iteration": 2.837684154510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202565, + "balance_loss_mlp": 1.16482282, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.0951215156771631, + "language_loss": 0.9350909, + "learning_rate": 0.00099588687246438, + "loss": 0.94711655, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.37719727, + "step": 362, + "time_per_iteration": 2.9100425243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203748, + "balance_loss_mlp": 1.16460001, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.11257096193086513, + "language_loss": 1.01560402, + "learning_rate": 0.0009958468977702471, + "loss": 1.02764153, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.39160156, + "step": 363, + "time_per_iteration": 2.6317808628082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01643136, + "balance_loss_mlp": 1.57790494, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.0741524690412032, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81377846, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.65234375, + "step": 364, + "time_per_iteration": 4.827174663543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187013, + "balance_loss_mlp": 1.15229964, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.07557653682461403, + "language_loss": 0.89914072, + "learning_rate": 0.0009957663708830612, + 
"loss": 0.91101086, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.34741211, + "step": 365, + "time_per_iteration": 3.280808448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201016, + "balance_loss_mlp": 1.16401315, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.11033601827195522, + "language_loss": 0.91889954, + "learning_rate": 0.0009957258187212714, + "loss": 0.93090969, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.36987305, + "step": 366, + "time_per_iteration": 3.0436058044433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01494507, + "balance_loss_mlp": 1.43309093, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.06331255113068197, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80689365, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.61328125, + "step": 367, + "time_per_iteration": 4.807323694229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209897, + "balance_loss_mlp": 1.17287028, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.07799784999620897, + "language_loss": 0.8953917, + "learning_rate": 0.0009956441370400167, + "loss": 0.90749061, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.37036133, + "step": 368, + "time_per_iteration": 2.678028106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218753, + "balance_loss_mlp": 1.18270361, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.11766553351136624, + "language_loss": 0.9529528, + "learning_rate": 0.0009956030075522636, + "loss": 0.96514034, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.3605957, + "step": 369, + "time_per_iteration": 2.7700181007385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195721, + "balance_loss_mlp": 1.16050696, + "epoch": 0.07118122354751828, + 
"flos": 548682439680.0, + "grad_norm": 0.07977968738165528, + "language_loss": 0.95944411, + "learning_rate": 0.0009955616856543587, + "loss": 0.97140133, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.35205078, + "step": 370, + "time_per_iteration": 2.6467819213867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011765, + "balance_loss_mlp": 1.14142823, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.07610844541313569, + "language_loss": 0.88427055, + "learning_rate": 0.0009955201713623448, + "loss": 0.89603543, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.35083008, + "step": 371, + "time_per_iteration": 2.8926849365234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262059, + "balance_loss_mlp": 1.21208656, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.04749961224137468, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77934778, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.5, + "step": 372, + "time_per_iteration": 4.978249549865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137568, + "balance_loss_mlp": 1.10769427, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.10296972579398556, + "language_loss": 1.02312946, + "learning_rate": 0.0009954365656605333, + "loss": 1.03450513, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.29882812, + "step": 373, + "time_per_iteration": 2.5930416584014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163699, + "balance_loss_mlp": 1.1306777, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.08216565506059122, + "language_loss": 0.94662046, + "learning_rate": 0.0009953944742831947, + "loss": 0.95825744, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.33007812, + "step": 374, + "time_per_iteration": 3.02325701713562 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149175, + "balance_loss_mlp": 1.1185143, + "epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.11719346683047478, + "language_loss": 0.98373723, + "learning_rate": 0.0009953521905766642, + "loss": 0.99522901, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.30639648, + "step": 375, + "time_per_iteration": 2.972064733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156803, + "balance_loss_mlp": 1.12435448, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.06602159555114745, + "language_loss": 0.97082627, + "learning_rate": 0.0009953097145573577, + "loss": 0.98239434, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.32446289, + "step": 376, + "time_per_iteration": 2.6647017002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183036, + "balance_loss_mlp": 1.14922833, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.0696983564537716, + "language_loss": 0.94069874, + "learning_rate": 0.000995267046241766, + "loss": 0.95252913, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.33837891, + "step": 377, + "time_per_iteration": 3.2699291706085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186549, + "balance_loss_mlp": 1.15281284, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.08226328739164854, + "language_loss": 0.94401312, + "learning_rate": 0.0009952241856464547, + "loss": 0.95587862, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.33764648, + "step": 378, + "time_per_iteration": 2.6432976722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191312, + "balance_loss_mlp": 1.15698004, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.12013480935274141, + "language_loss": 1.00853705, + "learning_rate": 0.0009951811327880632, + "loss": 1.02045012, + 
"num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.34350586, + "step": 379, + "time_per_iteration": 2.822204828262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192206, + "balance_loss_mlp": 1.15858889, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.08341634879178654, + "language_loss": 0.94250029, + "learning_rate": 0.0009951378876833063, + "loss": 0.95442235, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.3359375, + "step": 380, + "time_per_iteration": 2.630469799041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198017, + "balance_loss_mlp": 1.16311216, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.09052350379823415, + "language_loss": 1.00640893, + "learning_rate": 0.0009950944503489736, + "loss": 1.01838911, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.34936523, + "step": 381, + "time_per_iteration": 2.758171796798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202881, + "balance_loss_mlp": 1.16811991, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.08361033479665086, + "language_loss": 0.95423895, + "learning_rate": 0.0009950508208019285, + "loss": 0.96626776, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.34741211, + "step": 382, + "time_per_iteration": 2.9980571269989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187801, + "balance_loss_mlp": 1.15489948, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.06841783055573346, + "language_loss": 0.99123466, + "learning_rate": 0.0009950069990591096, + "loss": 1.00311255, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.32910156, + "step": 383, + "time_per_iteration": 2.723439931869507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01573707, + "balance_loss_mlp": 1.54185438, + "epoch": 0.07387456714120816, + "flos": 
1554648629760.0, + "grad_norm": 0.1397468631511101, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.77975076, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.31835938, + "step": 384, + "time_per_iteration": 4.962388753890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189426, + "balance_loss_mlp": 1.15592778, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.0845037323153299, + "language_loss": 0.92480063, + "learning_rate": 0.0009949187790542777, + "loss": 0.93669498, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.33496094, + "step": 385, + "time_per_iteration": 2.7766611576080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193494, + "balance_loss_mlp": 1.16052091, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.0971687641338208, + "language_loss": 0.884184, + "learning_rate": 0.0009948743808265148, + "loss": 0.89611894, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.32983398, + "step": 386, + "time_per_iteration": 2.674055576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183765, + "balance_loss_mlp": 1.15150666, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.07384542423184925, + "language_loss": 0.97962248, + "learning_rate": 0.0009948297904714782, + "loss": 0.9914602, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.32250977, + "step": 387, + "time_per_iteration": 2.698899745941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179922, + "balance_loss_mlp": 1.14680552, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.06832562007069648, + "language_loss": 0.90421599, + "learning_rate": 0.0009947850080064796, + "loss": 0.91601527, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.33105469, + "step": 388, + "time_per_iteration": 2.8182406425476074 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178557, + "balance_loss_mlp": 1.14639437, + "epoch": 0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.06958908790939329, + "language_loss": 0.94972193, + "learning_rate": 0.0009947400334489047, + "loss": 0.96150756, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.3215332, + "step": 389, + "time_per_iteration": 3.0191807746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180436, + "balance_loss_mlp": 1.14767742, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.0847539772518024, + "language_loss": 0.86555678, + "learning_rate": 0.0009946948668162145, + "loss": 0.87736106, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.32763672, + "step": 390, + "time_per_iteration": 2.7670745849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182484, + "balance_loss_mlp": 1.14886689, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.08648624436703037, + "language_loss": 0.91666478, + "learning_rate": 0.0009946495081259441, + "loss": 0.92848963, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.33618164, + "step": 391, + "time_per_iteration": 2.8355910778045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168167, + "balance_loss_mlp": 1.13454986, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.09254550247646448, + "language_loss": 0.94977629, + "learning_rate": 0.0009946039573957035, + "loss": 0.96145797, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.33618164, + "step": 392, + "time_per_iteration": 2.9576094150543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143715, + "balance_loss_mlp": 1.11300731, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.06908129255101257, + "language_loss": 0.91881704, + "learning_rate": 0.000994558214643177, + "loss": 0.93025422, + 
"num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.30712891, + "step": 393, + "time_per_iteration": 2.757168769836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141782, + "balance_loss_mlp": 1.11102629, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.06274973991827922, + "language_loss": 0.93209511, + "learning_rate": 0.000994512279886123, + "loss": 0.94351292, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.30712891, + "step": 394, + "time_per_iteration": 3.1078224182128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134727, + "balance_loss_mlp": 1.10523462, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.07515736533799398, + "language_loss": 0.93902445, + "learning_rate": 0.0009944661531423758, + "loss": 0.95037174, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.29492188, + "step": 395, + "time_per_iteration": 2.6783392429351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149746, + "balance_loss_mlp": 1.12061143, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.07362715907626581, + "language_loss": 0.91989446, + "learning_rate": 0.000994419834429843, + "loss": 0.93139195, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.29125977, + "step": 396, + "time_per_iteration": 2.6774208545684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138515, + "balance_loss_mlp": 1.10887921, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.0979297809656427, + "language_loss": 0.95834494, + "learning_rate": 0.0009943733237665069, + "loss": 0.96973014, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.29663086, + "step": 397, + "time_per_iteration": 2.8543148040771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162601, + "balance_loss_mlp": 1.13260818, + "epoch": 0.07656791073489803, + "flos": 
579379682304.0, + "grad_norm": 0.07305506526715269, + "language_loss": 0.95531559, + "learning_rate": 0.0009943266211704248, + "loss": 0.96694154, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.29956055, + "step": 398, + "time_per_iteration": 2.9546711444854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155699, + "balance_loss_mlp": 1.12427545, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.0773299202354709, + "language_loss": 0.97448099, + "learning_rate": 0.000994279726659728, + "loss": 0.98603797, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.31396484, + "step": 399, + "time_per_iteration": 2.51406192779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178721, + "balance_loss_mlp": 1.14610541, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.0761918911056457, + "language_loss": 0.9424448, + "learning_rate": 0.0009942326402526231, + "loss": 0.95423204, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.32617188, + "step": 400, + "time_per_iteration": 2.578338146209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175813, + "balance_loss_mlp": 1.14300704, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.0730936916243032, + "language_loss": 0.93335903, + "learning_rate": 0.0009941853619673902, + "loss": 0.94511712, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.328125, + "step": 401, + "time_per_iteration": 2.6568922996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175231, + "balance_loss_mlp": 1.14356887, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.0850905540992329, + "language_loss": 0.95842957, + "learning_rate": 0.0009941378918223844, + "loss": 0.97018182, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.31616211, + "step": 402, + "time_per_iteration": 3.098615884780884 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204282, + "balance_loss_mlp": 1.17018807, + "epoch": 0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.09392353942632323, + "language_loss": 0.9004057, + "learning_rate": 0.0009940902298360354, + "loss": 0.91244853, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.34130859, + "step": 403, + "time_per_iteration": 2.769843101501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188661, + "balance_loss_mlp": 1.15563989, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.0817674600565604, + "language_loss": 0.98311555, + "learning_rate": 0.0009940423760268473, + "loss": 0.99500215, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.33007812, + "step": 404, + "time_per_iteration": 2.8945982456207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187921, + "balance_loss_mlp": 1.15442348, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.0859899885976376, + "language_loss": 0.92015374, + "learning_rate": 0.0009939943304133982, + "loss": 0.93203294, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.33496094, + "step": 405, + "time_per_iteration": 2.649461269378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172828, + "balance_loss_mlp": 1.14228618, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.07444114263212052, + "language_loss": 0.99398023, + "learning_rate": 0.0009939460930143416, + "loss": 1.00570846, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.30517578, + "step": 406, + "time_per_iteration": 2.667829990386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181986, + "balance_loss_mlp": 1.15091991, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.08442820151342731, + "language_loss": 0.93529546, + "learning_rate": 0.0009938976638484043, + "loss": 0.9471153, + 
"num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.31054688, + "step": 407, + "time_per_iteration": 2.9581079483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184799, + "balance_loss_mlp": 1.15428162, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.08907940163556441, + "language_loss": 0.91453135, + "learning_rate": 0.0009938490429343887, + "loss": 0.92637932, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.30493164, + "step": 408, + "time_per_iteration": 2.6066792011260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198916, + "balance_loss_mlp": 1.16708684, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.09407218950155852, + "language_loss": 0.92654747, + "learning_rate": 0.0009938002302911709, + "loss": 0.93853664, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.31835938, + "step": 409, + "time_per_iteration": 2.7762253284454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206718, + "balance_loss_mlp": 1.17415047, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.10932104394797525, + "language_loss": 0.95012206, + "learning_rate": 0.0009937512259377015, + "loss": 0.96218926, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.32543945, + "step": 410, + "time_per_iteration": 2.7103991508483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265174, + "balance_loss_mlp": 1.23193812, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.08720536696991275, + "language_loss": 0.94637173, + "learning_rate": 0.000993702029893006, + "loss": 0.95902348, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.33251953, + "step": 411, + "time_per_iteration": 2.78560733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295676, + "balance_loss_mlp": 1.26029515, + "epoch": 0.07926125432858792, + "flos": 
821984993280.0, + "grad_norm": 0.11720891364975168, + "language_loss": 0.93816972, + "learning_rate": 0.0009936526421761838, + "loss": 0.95112646, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.35400391, + "step": 412, + "time_per_iteration": 3.049868583679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128071, + "balance_loss_mlp": 1.24611533, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.095587468789244, + "language_loss": 0.96658343, + "learning_rate": 0.000993603062806409, + "loss": 0.9793905, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.34619141, + "step": 413, + "time_per_iteration": 2.6881110668182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262528, + "balance_loss_mlp": 1.22843432, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.10701391534122558, + "language_loss": 0.98645592, + "learning_rate": 0.0009935532918029298, + "loss": 0.99908125, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.34082031, + "step": 414, + "time_per_iteration": 2.6234540939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253352, + "balance_loss_mlp": 1.21847153, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.10153449079868698, + "language_loss": 0.92723763, + "learning_rate": 0.0009935033291850694, + "loss": 0.93977106, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.34887695, + "step": 415, + "time_per_iteration": 2.6565287113189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224774, + "balance_loss_mlp": 1.19258738, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.09081981361814888, + "language_loss": 0.94647777, + "learning_rate": 0.0009934531749722247, + "loss": 0.95872557, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.32177734, + "step": 416, + "time_per_iteration": 2.6504123210906982 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214952, + "balance_loss_mlp": 1.18243122, + "epoch": 0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.08798076505254328, + "language_loss": 0.92810607, + "learning_rate": 0.0009934028291838672, + "loss": 0.94025552, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.32495117, + "step": 417, + "time_per_iteration": 2.7589621543884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202827, + "balance_loss_mlp": 1.17018712, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.08954115452054644, + "language_loss": 0.88617092, + "learning_rate": 0.0009933522918395433, + "loss": 0.8981992, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.32592773, + "step": 418, + "time_per_iteration": 2.713758707046509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01389029, + "balance_loss_mlp": 1.35851097, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.08425204298586858, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79640126, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.3046875, + "step": 419, + "time_per_iteration": 4.9331464767456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218909, + "balance_loss_mlp": 1.18479085, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.11622805941353512, + "language_loss": 1.05362594, + "learning_rate": 0.000993250642561551, + "loss": 1.06581497, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.34106445, + "step": 420, + "time_per_iteration": 2.672421455383301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181293, + "balance_loss_mlp": 1.14843905, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.10269562775159036, + "language_loss": 0.92318636, + "learning_rate": 0.0009931995306673466, + "loss": 0.93499923, + 
"num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.32861328, + "step": 421, + "time_per_iteration": 2.7427923679351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168977, + "balance_loss_mlp": 1.13657558, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.11431346275656909, + "language_loss": 0.97376955, + "learning_rate": 0.000993148227296103, + "loss": 0.98545933, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.32397461, + "step": 422, + "time_per_iteration": 2.675947666168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151197, + "balance_loss_mlp": 1.12122786, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.0890704687176176, + "language_loss": 0.860506, + "learning_rate": 0.000993096732467738, + "loss": 0.87201798, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.29956055, + "step": 423, + "time_per_iteration": 3.060911178588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184994, + "balance_loss_mlp": 1.15089989, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.1141297149032614, + "language_loss": 0.91752422, + "learning_rate": 0.0009930450462022435, + "loss": 0.9293741, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.34106445, + "step": 424, + "time_per_iteration": 2.8769121170043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233579, + "balance_loss_mlp": 1.20020068, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.046425192010764525, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80423385, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.33398438, + "step": 425, + "time_per_iteration": 4.897430896759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206307, + "balance_loss_mlp": 1.17078233, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, 
+ "grad_norm": 0.08757679589662427, + "language_loss": 0.89939743, + "learning_rate": 0.0009929410994402065, + "loss": 0.91146052, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.35522461, + "step": 426, + "time_per_iteration": 3.8015847206115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247446, + "balance_loss_mlp": 1.21072912, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.09694830127406533, + "language_loss": 0.94969749, + "learning_rate": 0.0009928888389840196, + "loss": 0.96217191, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.3671875, + "step": 427, + "time_per_iteration": 2.7042434215545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244821, + "balance_loss_mlp": 1.21010745, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.09892511285502391, + "language_loss": 0.97471511, + "learning_rate": 0.0009928363871714147, + "loss": 0.98716331, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.34716797, + "step": 428, + "time_per_iteration": 2.6848952770233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253097, + "balance_loss_mlp": 1.21733463, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.08269527052289877, + "language_loss": 0.91760862, + "learning_rate": 0.0009927837440227556, + "loss": 0.9301396, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.35766602, + "step": 429, + "time_per_iteration": 2.838052749633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215238, + "balance_loss_mlp": 1.18357563, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.07794556654442977, + "language_loss": 0.88257664, + "learning_rate": 0.0009927309095584798, + "loss": 0.89472902, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.31640625, + "step": 430, + "time_per_iteration": 3.010039806365967 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212644, + "balance_loss_mlp": 1.18246055, + "epoch": 0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.10632891775269031, + "language_loss": 0.96743113, + "learning_rate": 0.0009926778837991, + "loss": 0.97955757, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.30175781, + "step": 431, + "time_per_iteration": 2.734591484069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182523, + "balance_loss_mlp": 1.15226734, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.09435654496071201, + "language_loss": 0.9613564, + "learning_rate": 0.000992624666765202, + "loss": 0.97318161, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.30249023, + "step": 432, + "time_per_iteration": 2.829540252685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164349, + "balance_loss_mlp": 1.13523841, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.09286672234440549, + "language_loss": 0.93021452, + "learning_rate": 0.000992571258477447, + "loss": 0.94185793, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.29101562, + "step": 433, + "time_per_iteration": 2.8295536041259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154694, + "balance_loss_mlp": 1.12720466, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.10037104501236055, + "language_loss": 0.88638759, + "learning_rate": 0.0009925176589565695, + "loss": 0.89793456, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.27514648, + "step": 434, + "time_per_iteration": 2.8025705814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164119, + "balance_loss_mlp": 1.13445985, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.1154039733497609, + "language_loss": 0.97325677, + "learning_rate": 0.0009924638682233791, + "loss": 0.98489797, + 
"num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.29663086, + "step": 435, + "time_per_iteration": 2.576300621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175635, + "balance_loss_mlp": 1.14626217, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.058007479940938765, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80740231, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.29296875, + "step": 436, + "time_per_iteration": 4.615980625152588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203285, + "balance_loss_mlp": 1.17262459, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.10734010742427191, + "language_loss": 0.87080061, + "learning_rate": 0.0009923557132036668, + "loss": 0.88283348, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.30664062, + "step": 437, + "time_per_iteration": 3.098910331726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203826, + "balance_loss_mlp": 1.1721158, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.10713326361470918, + "language_loss": 0.92728174, + "learning_rate": 0.0009923013489591345, + "loss": 0.93932003, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.31713867, + "step": 438, + "time_per_iteration": 2.7423956394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198894, + "balance_loss_mlp": 1.16902053, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.10035753440716286, + "language_loss": 0.90567303, + "learning_rate": 0.0009922467935862681, + "loss": 0.91766196, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.29882812, + "step": 439, + "time_per_iteration": 3.1101534366607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205509, + "balance_loss_mlp": 1.17477679, + "epoch": 0.08464794151596768, + "flos": 
510184604160.0, + "grad_norm": 0.11954514685823285, + "language_loss": 0.93942809, + "learning_rate": 0.0009921920471062478, + "loss": 0.95148319, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.30712891, + "step": 440, + "time_per_iteration": 2.600698947906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120422, + "balance_loss_mlp": 1.1727252, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.09556707126690236, + "language_loss": 0.90983319, + "learning_rate": 0.0009921371095403281, + "loss": 0.92187542, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.31518555, + "step": 441, + "time_per_iteration": 2.6733319759368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223638, + "balance_loss_mlp": 1.19223797, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.07797244609311368, + "language_loss": 0.93788469, + "learning_rate": 0.0009920819809098379, + "loss": 0.95012105, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.3137207, + "step": 442, + "time_per_iteration": 2.6183252334594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225883, + "balance_loss_mlp": 1.1949122, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.09885461493176821, + "language_loss": 0.89838576, + "learning_rate": 0.0009920266612361798, + "loss": 0.91064465, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.30957031, + "step": 443, + "time_per_iteration": 2.8172709941864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226855, + "balance_loss_mlp": 1.19721913, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.0888891387256682, + "language_loss": 0.90358502, + "learning_rate": 0.0009919711505408308, + "loss": 0.91585356, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.29614258, + "step": 444, + "time_per_iteration": 2.8260107040405273 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210635, + "balance_loss_mlp": 1.17978323, + "epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.08298354336382399, + "language_loss": 0.88123727, + "learning_rate": 0.000991915448845342, + "loss": 0.89334357, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.30810547, + "step": 445, + "time_per_iteration": 2.5825653076171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189896, + "balance_loss_mlp": 1.16185772, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.079307281997536, + "language_loss": 0.97017783, + "learning_rate": 0.000991859556171339, + "loss": 0.98207676, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.28027344, + "step": 446, + "time_per_iteration": 2.60908579826355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169669, + "balance_loss_mlp": 1.14200044, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.12218297197997938, + "language_loss": 0.98194999, + "learning_rate": 0.000991803472540521, + "loss": 0.99364674, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.27648926, + "step": 447, + "time_per_iteration": 2.6712088584899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151986, + "balance_loss_mlp": 1.12646365, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.09227172547062512, + "language_loss": 0.94125748, + "learning_rate": 0.0009917471979746615, + "loss": 0.95277739, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.25549316, + "step": 448, + "time_per_iteration": 3.075975179672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168815, + "balance_loss_mlp": 1.1426959, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.08941799521583026, + "language_loss": 0.93856514, + "learning_rate": 0.0009916907324956086, + "loss": 0.95025325, + 
"num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.26123047, + "step": 449, + "time_per_iteration": 2.736283540725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172223, + "balance_loss_mlp": 1.14490044, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.10083399298029862, + "language_loss": 0.89324713, + "learning_rate": 0.0009916340761252837, + "loss": 0.90496939, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.27331543, + "step": 450, + "time_per_iteration": 2.7378761768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159685, + "balance_loss_mlp": 1.13442445, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.08549336336253632, + "language_loss": 0.87181985, + "learning_rate": 0.0009915772288856832, + "loss": 0.88341665, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.25268555, + "step": 451, + "time_per_iteration": 3.0766491889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155631, + "balance_loss_mlp": 1.12976265, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.07927995723527953, + "language_loss": 0.88654345, + "learning_rate": 0.000991520190798877, + "loss": 0.89809978, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.2590332, + "step": 452, + "time_per_iteration": 2.838925838470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158093, + "balance_loss_mlp": 1.13122344, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.12430534270573573, + "language_loss": 0.96291733, + "learning_rate": 0.0009914629618870089, + "loss": 0.97449821, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.26904297, + "step": 453, + "time_per_iteration": 2.902444839477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103846, + "balance_loss_mlp": 1.0800997, + "epoch": 0.08734128510965757, + "flos": 
1482303214080.0, + "grad_norm": 0.040702290127782634, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.7977972, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.23730469, + "step": 454, + "time_per_iteration": 4.758902072906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089921, + "balance_loss_mlp": 1.06579328, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.037925831915212815, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.8251788, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.24121094, + "step": 455, + "time_per_iteration": 4.819866180419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230669, + "balance_loss_mlp": 1.19860172, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.12072891758744606, + "language_loss": 0.9005816, + "learning_rate": 0.0009912901304235883, + "loss": 0.91288829, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.32055664, + "step": 456, + "time_per_iteration": 2.928392171859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251391, + "balance_loss_mlp": 1.21851277, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.11610109334616998, + "language_loss": 0.85792667, + "learning_rate": 0.000991232138434397, + "loss": 0.8704406, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.32885742, + "step": 457, + "time_per_iteration": 2.868086099624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268634, + "balance_loss_mlp": 1.23406374, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.1267050228562, + "language_loss": 0.92359412, + "learning_rate": 0.000991173955731976, + "loss": 0.93628043, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.34594727, + "step": 458, + "time_per_iteration": 2.673590898513794 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225195, + "balance_loss_mlp": 1.19374788, + "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.08225728813848474, + "language_loss": 0.98437196, + "learning_rate": 0.0009911155823389137, + "loss": 0.99662387, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.31445312, + "step": 459, + "time_per_iteration": 3.052828550338745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208858, + "balance_loss_mlp": 1.17938948, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.06750279925545952, + "language_loss": 0.93789524, + "learning_rate": 0.000991057018277873, + "loss": 0.94998378, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.29467773, + "step": 460, + "time_per_iteration": 2.7062363624572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175577, + "balance_loss_mlp": 1.14656162, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.09934743705595177, + "language_loss": 0.93365753, + "learning_rate": 0.0009909982635715898, + "loss": 0.94541329, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.28979492, + "step": 461, + "time_per_iteration": 2.647963523864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163614, + "balance_loss_mlp": 1.13576651, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.09828505426249505, + "language_loss": 0.93045211, + "learning_rate": 0.0009909393182428751, + "loss": 0.94208831, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.27856445, + "step": 462, + "time_per_iteration": 2.6743412017822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163317, + "balance_loss_mlp": 1.13556552, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.08889819955039441, + "language_loss": 0.88051188, + "learning_rate": 0.000990880182314614, + "loss": 0.89214504, + 
"num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.27758789, + "step": 463, + "time_per_iteration": 2.703216314315796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163709, + "balance_loss_mlp": 1.1364336, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.07282859671945509, + "language_loss": 0.89247352, + "learning_rate": 0.0009908208558097643, + "loss": 0.90411055, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.27319336, + "step": 464, + "time_per_iteration": 2.9412851333618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011721, + "balance_loss_mlp": 1.14410961, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.07278927788912996, + "language_loss": 0.90032935, + "learning_rate": 0.000990761338751359, + "loss": 0.91205037, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.2800293, + "step": 465, + "time_per_iteration": 2.7876837253570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_mlp": 1.0181731, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.02426695301026172, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74698591, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.20800781, + "step": 466, + "time_per_iteration": 5.05983304977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189569, + "balance_loss_mlp": 1.16098237, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.07846733746050528, + "language_loss": 0.9248395, + "learning_rate": 0.0009906417330663815, + "loss": 0.93673521, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.28588867, + "step": 467, + "time_per_iteration": 2.696319103240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194386, + "balance_loss_mlp": 1.16539454, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + 
"grad_norm": 0.08323950534675657, + "language_loss": 0.88480067, + "learning_rate": 0.0009905816444862442, + "loss": 0.89674455, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.29003906, + "step": 468, + "time_per_iteration": 2.6381607055664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218295, + "balance_loss_mlp": 1.18875456, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.07740224213463104, + "language_loss": 0.8706888, + "learning_rate": 0.0009905213654454216, + "loss": 0.88287175, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.29516602, + "step": 469, + "time_per_iteration": 2.9251277446746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229446, + "balance_loss_mlp": 1.19940567, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.08990381668478556, + "language_loss": 0.94001997, + "learning_rate": 0.0009904608959673158, + "loss": 0.95231444, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.30053711, + "step": 470, + "time_per_iteration": 2.812967538833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247261, + "balance_loss_mlp": 1.21679068, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.12209234788536222, + "language_loss": 0.92894399, + "learning_rate": 0.000990400236075403, + "loss": 0.94141662, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.3046875, + "step": 471, + "time_per_iteration": 2.5002622604370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205406, + "balance_loss_mlp": 1.17622375, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.10180872621251921, + "language_loss": 0.91581351, + "learning_rate": 0.0009903393857932338, + "loss": 0.92786753, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.29150391, + "step": 472, + "time_per_iteration": 2.656669855117798 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.0119747, + "balance_loss_mlp": 1.16866922, + "epoch": 0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.09565392288785843, + "language_loss": 0.88802767, + "learning_rate": 0.0009902783451444317, + "loss": 0.90000236, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.28808594, + "step": 473, + "time_per_iteration": 2.769510269165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118613, + "balance_loss_mlp": 1.16034544, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.10259894411844421, + "language_loss": 0.94123209, + "learning_rate": 0.0009902171141526956, + "loss": 0.95309335, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.25769043, + "step": 474, + "time_per_iteration": 2.523611545562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119701, + "balance_loss_mlp": 1.17120147, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.11667434950480311, + "language_loss": 0.82319391, + "learning_rate": 0.000990155692841797, + "loss": 0.83516395, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.25817871, + "step": 475, + "time_per_iteration": 2.9675121307373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227501, + "balance_loss_mlp": 1.20134616, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.09682112540143008, + "language_loss": 0.93477046, + "learning_rate": 0.0009900940812355818, + "loss": 0.94704551, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.26147461, + "step": 476, + "time_per_iteration": 2.924874782562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242063, + "balance_loss_mlp": 1.21519351, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.10139353171378648, + "language_loss": 0.88050354, + "learning_rate": 0.00099003227935797, + "loss": 0.89292419, + "num_input_tokens_seen": 
39558224, + "router_z_loss_mlp": 0.26879883, + "step": 477, + "time_per_iteration": 2.7283573150634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261302, + "balance_loss_mlp": 1.23314476, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.08348382552066277, + "language_loss": 0.91095632, + "learning_rate": 0.000989970287232955, + "loss": 0.92356932, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.28149414, + "step": 478, + "time_per_iteration": 2.8266103267669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241973, + "balance_loss_mlp": 1.21583056, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.10737558840696987, + "language_loss": 0.89902192, + "learning_rate": 0.0009899081048846043, + "loss": 0.91144162, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.26135254, + "step": 479, + "time_per_iteration": 2.6420280933380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291965, + "balance_loss_mlp": 1.26427281, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.10012000168567356, + "language_loss": 0.93502498, + "learning_rate": 0.0009898457323370593, + "loss": 0.94794464, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.27697754, + "step": 480, + "time_per_iteration": 2.6065309047698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246143, + "balance_loss_mlp": 1.21961892, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.0993880337212747, + "language_loss": 0.92708224, + "learning_rate": 0.000989783169614535, + "loss": 0.93954372, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.26525879, + "step": 481, + "time_per_iteration": 2.7099456787109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079427, + "balance_loss_mlp": 1.05930424, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 
0.03505173716607146, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.79832184, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.20117188, + "step": 482, + "time_per_iteration": 4.890375852584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231966, + "balance_loss_mlp": 1.20573974, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.10137363964482546, + "language_loss": 0.90139151, + "learning_rate": 0.000989657473741779, + "loss": 0.91371119, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.2623291, + "step": 483, + "time_per_iteration": 2.9370570182800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207368, + "balance_loss_mlp": 1.18004489, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.08498377120371232, + "language_loss": 0.9143101, + "learning_rate": 0.0009895943406403465, + "loss": 0.92638373, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.2734375, + "step": 484, + "time_per_iteration": 2.7508950233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207641, + "balance_loss_mlp": 1.17798209, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.09176142665566275, + "language_loss": 0.84377563, + "learning_rate": 0.0009895310174615338, + "loss": 0.85585213, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.29638672, + "step": 485, + "time_per_iteration": 2.785452365875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111377, + "balance_loss_mlp": 1.09211314, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.060723434374539316, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76829892, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.19238281, + "step": 486, + "time_per_iteration": 4.6911780834198 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0119428, + "balance_loss_mlp": 1.16636121, + "epoch": 0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.10396612544825783, + "language_loss": 0.89653724, + "learning_rate": 0.0009894038009701782, + "loss": 0.90848005, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.27954102, + "step": 487, + "time_per_iteration": 2.6375234127044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240132, + "balance_loss_mlp": 1.20847011, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.09761423787564506, + "language_loss": 0.88893723, + "learning_rate": 0.0009893399077070253, + "loss": 0.90133858, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.31640625, + "step": 488, + "time_per_iteration": 2.63673734664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217529, + "balance_loss_mlp": 1.18844151, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.08578938939363263, + "language_loss": 0.87286389, + "learning_rate": 0.0009892758244652718, + "loss": 0.88503921, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.29077148, + "step": 489, + "time_per_iteration": 2.6579813957214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226074, + "balance_loss_mlp": 1.19698668, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.10664482488995004, + "language_loss": 0.91801828, + "learning_rate": 0.0009892115512697968, + "loss": 0.93027902, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.29101562, + "step": 490, + "time_per_iteration": 2.744812250137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208984, + "balance_loss_mlp": 1.18106508, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.07150484911777356, + "language_loss": 0.94226933, + "learning_rate": 0.0009891470881455537, + "loss": 0.95435917, + "num_input_tokens_seen": 40853376, 
+ "router_z_loss_mlp": 0.27905273, + "step": 491, + "time_per_iteration": 2.7888436317443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184692, + "balance_loss_mlp": 1.15854979, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.08034794474061628, + "language_loss": 0.91272295, + "learning_rate": 0.0009890824351175692, + "loss": 0.92456985, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.26184082, + "step": 492, + "time_per_iteration": 2.6893324851989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168375, + "balance_loss_mlp": 1.1430074, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.11413207975143042, + "language_loss": 0.96479064, + "learning_rate": 0.0009890175922109435, + "loss": 0.9764744, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.25378418, + "step": 493, + "time_per_iteration": 2.678849935531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184878, + "balance_loss_mlp": 1.15874791, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.08018179898504754, + "language_loss": 0.9392823, + "learning_rate": 0.0009889525594508513, + "loss": 0.95113099, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.26147461, + "step": 494, + "time_per_iteration": 3.067603349685669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118757, + "balance_loss_mlp": 1.16171312, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.06605208103828443, + "language_loss": 0.88701022, + "learning_rate": 0.0009888873368625404, + "loss": 0.89888591, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.25891113, + "step": 495, + "time_per_iteration": 2.513042688369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208336, + "balance_loss_mlp": 1.18301558, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.21045205282495727, 
+ "language_loss": 0.923554, + "learning_rate": 0.0009888219244713326, + "loss": 0.93563735, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.2532959, + "step": 496, + "time_per_iteration": 2.867083787918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121286, + "balance_loss_mlp": 1.18638349, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.11531388313037762, + "language_loss": 0.9129262, + "learning_rate": 0.0009887563223026229, + "loss": 0.92505479, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.26501465, + "step": 497, + "time_per_iteration": 2.708878993988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251726, + "balance_loss_mlp": 1.23503661, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.1018924396807409, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80319893, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.16699219, + "step": 498, + "time_per_iteration": 4.9335105419158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203662, + "balance_loss_mlp": 1.1767087, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.15301104951645897, + "language_loss": 0.9155978, + "learning_rate": 0.0009886245487346482, + "loss": 0.92763442, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.2701416, + "step": 499, + "time_per_iteration": 3.048356771469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012374, + "balance_loss_mlp": 1.20936203, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.11445293306924414, + "language_loss": 0.93613195, + "learning_rate": 0.0009885583773865422, + "loss": 0.948506, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.28076172, + "step": 500, + "time_per_iteration": 2.4812135696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124857, + 
"balance_loss_mlp": 1.21948266, + "epoch": 0.09638322431704502, + "flos": 534129292800.0, + "grad_norm": 0.08673144300895683, + "language_loss": 0.91201293, + "learning_rate": 0.0009884920163632524, + "loss": 0.92449856, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.2911377, + "step": 501, + "time_per_iteration": 2.6971659660339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267642, + "balance_loss_mlp": 1.23836422, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.09557615258578338, + "language_loss": 0.9327184, + "learning_rate": 0.000988425465690543, + "loss": 0.94539481, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.29296875, + "step": 502, + "time_per_iteration": 2.6156561374664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125655, + "balance_loss_mlp": 1.22724855, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.09431767215346384, + "language_loss": 0.90255487, + "learning_rate": 0.0009883587253942505, + "loss": 0.91512042, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.29284668, + "step": 503, + "time_per_iteration": 2.8239471912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284099, + "balance_loss_mlp": 1.25420117, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.11394137891765209, + "language_loss": 0.96597123, + "learning_rate": 0.0009882917955002862, + "loss": 0.97881228, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.29907227, + "step": 504, + "time_per_iteration": 2.603328227996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229149, + "balance_loss_mlp": 1.2021842, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.09281538791599028, + "language_loss": 0.89550316, + "learning_rate": 0.0009882246760346343, + "loss": 0.90779471, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.26977539, 
+ "step": 505, + "time_per_iteration": 2.681687831878662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229765, + "balance_loss_mlp": 1.20144057, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.10637320281066408, + "language_loss": 0.9312228, + "learning_rate": 0.0009881573670233533, + "loss": 0.94352043, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.28295898, + "step": 506, + "time_per_iteration": 2.5317869186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210552, + "balance_loss_mlp": 1.18480229, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.08668641437707587, + "language_loss": 0.88418829, + "learning_rate": 0.0009880898684925747, + "loss": 0.89629376, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.25769043, + "step": 507, + "time_per_iteration": 2.7037086486816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171606, + "balance_loss_mlp": 1.14662004, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.09301682260046856, + "language_loss": 0.8754462, + "learning_rate": 0.0009880221804685037, + "loss": 0.88716233, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.24987793, + "step": 508, + "time_per_iteration": 2.5904412269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132812, + "balance_loss_mlp": 1.11431122, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.05770369236985839, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80477232, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.18457031, + "step": 509, + "time_per_iteration": 4.754728317260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155131, + "balance_loss_mlp": 1.12993002, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.08546011105886044, + "language_loss": 0.93283963, 
+ "learning_rate": 0.0009878862360456733, + "loss": 0.94439089, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.25219727, + "step": 510, + "time_per_iteration": 2.7473011016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139729, + "balance_loss_mlp": 1.11480212, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.09364527364696289, + "language_loss": 0.86814249, + "learning_rate": 0.0009878179796996922, + "loss": 0.87953973, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.24926758, + "step": 511, + "time_per_iteration": 2.74253249168396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157281, + "balance_loss_mlp": 1.13087618, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.0728025857811697, + "language_loss": 0.90271652, + "learning_rate": 0.0009877495339659754, + "loss": 0.91428936, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.26428223, + "step": 512, + "time_per_iteration": 2.7383904457092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011795, + "balance_loss_mlp": 1.15373945, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.08851969035528326, + "language_loss": 0.84944135, + "learning_rate": 0.000987680898871096, + "loss": 0.86123633, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.2578125, + "step": 513, + "time_per_iteration": 2.7576277256011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213945, + "balance_loss_mlp": 1.18686032, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.10650793826837307, + "language_loss": 0.85207206, + "learning_rate": 0.0009876120744417, + "loss": 0.8642115, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.27075195, + "step": 514, + "time_per_iteration": 2.9868528842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205226, + "balance_loss_mlp": 
1.17891693, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + "grad_norm": 0.12423818842648264, + "language_loss": 0.94048339, + "learning_rate": 0.0009875430607045078, + "loss": 0.95253563, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.2635498, + "step": 515, + "time_per_iteration": 2.6809887886047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217575, + "balance_loss_mlp": 1.19226718, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.09121928261100491, + "language_loss": 0.90633368, + "learning_rate": 0.000987473857686313, + "loss": 0.91850942, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.2532959, + "step": 516, + "time_per_iteration": 2.821868896484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273678, + "balance_loss_mlp": 1.24556851, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.10235865570139392, + "language_loss": 0.92397732, + "learning_rate": 0.0009874044654139824, + "loss": 0.93671417, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.28125, + "step": 517, + "time_per_iteration": 2.754556894302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269361, + "balance_loss_mlp": 1.24070311, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.1033638080855083, + "language_loss": 0.91346741, + "learning_rate": 0.0009873348839144563, + "loss": 0.92616105, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.28662109, + "step": 518, + "time_per_iteration": 2.5521421432495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223264, + "balance_loss_mlp": 1.19750261, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.08349046242237956, + "language_loss": 0.9484781, + "learning_rate": 0.000987265113214749, + "loss": 0.96071064, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.25793457, + "step": 519, + 
"time_per_iteration": 2.5728440284729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209741, + "balance_loss_mlp": 1.18294215, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.0925674463481217, + "language_loss": 0.93808675, + "learning_rate": 0.0009871951533419476, + "loss": 0.95018411, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.26794434, + "step": 520, + "time_per_iteration": 2.720158576965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173015, + "balance_loss_mlp": 1.14725351, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.08576102326010304, + "language_loss": 0.87117791, + "learning_rate": 0.0009871250043232132, + "loss": 0.88290811, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.25769043, + "step": 521, + "time_per_iteration": 2.7765281200408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167625, + "balance_loss_mlp": 1.14231658, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.08176746103179605, + "language_loss": 0.85016751, + "learning_rate": 0.0009870546661857797, + "loss": 0.86184376, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.25317383, + "step": 522, + "time_per_iteration": 2.621741771697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192442, + "balance_loss_mlp": 1.16581106, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.1034937566099096, + "language_loss": 0.93671012, + "learning_rate": 0.0009869841389569553, + "loss": 0.94863456, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.26647949, + "step": 523, + "time_per_iteration": 2.9877190589904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176234, + "balance_loss_mlp": 1.15106893, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.09839184495226623, + "language_loss": 0.87745041, + "learning_rate": 
0.0009869134226641206, + "loss": 0.88921273, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.25170898, + "step": 524, + "time_per_iteration": 2.5881335735321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182922, + "balance_loss_mlp": 1.15635061, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.08405321822424026, + "language_loss": 0.86857122, + "learning_rate": 0.0009868425173347303, + "loss": 0.88040042, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.26599121, + "step": 525, + "time_per_iteration": 2.66532301902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171089, + "balance_loss_mlp": 1.14556646, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.08405786654151125, + "language_loss": 0.94851571, + "learning_rate": 0.0009867714229963125, + "loss": 0.96022666, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.25549316, + "step": 526, + "time_per_iteration": 2.8129477500915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180551, + "balance_loss_mlp": 1.15515995, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.0887042459069511, + "language_loss": 0.92144597, + "learning_rate": 0.000986700139676468, + "loss": 0.93325144, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.25402832, + "step": 527, + "time_per_iteration": 2.5864803791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221342, + "balance_loss_mlp": 1.19498479, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.0908626798732068, + "language_loss": 0.89802891, + "learning_rate": 0.0009866286674028717, + "loss": 0.91024232, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.26379395, + "step": 528, + "time_per_iteration": 2.6321539878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195715, + "balance_loss_mlp": 1.1701684, + 
"epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 0.10105960014250041, + "language_loss": 0.86296791, + "learning_rate": 0.0009865570062032717, + "loss": 0.87492502, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.25561523, + "step": 529, + "time_per_iteration": 2.9451780319213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180062, + "balance_loss_mlp": 1.15431321, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.07153867300670864, + "language_loss": 0.9169668, + "learning_rate": 0.0009864851561054893, + "loss": 0.92876744, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.25756836, + "step": 530, + "time_per_iteration": 2.829380512237549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138967, + "balance_loss_mlp": 1.11554205, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.07949334936814403, + "language_loss": 0.90603149, + "learning_rate": 0.0009864131171374191, + "loss": 0.9174211, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.23413086, + "step": 531, + "time_per_iteration": 2.7103002071380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144785, + "balance_loss_mlp": 1.12042999, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.09480674153197077, + "language_loss": 0.89899409, + "learning_rate": 0.0009863408893270292, + "loss": 0.91044188, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.24353027, + "step": 532, + "time_per_iteration": 2.800015926361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135275, + "balance_loss_mlp": 1.11101604, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.12452848702407365, + "language_loss": 0.84814823, + "learning_rate": 0.0009862684727023605, + "loss": 0.85950094, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.24243164, + "step": 533, + 
"time_per_iteration": 2.733250856399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142831, + "balance_loss_mlp": 1.11813033, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.10251703935298907, + "language_loss": 0.88274956, + "learning_rate": 0.0009861958672915283, + "loss": 0.89417779, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.24707031, + "step": 534, + "time_per_iteration": 2.8380610942840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151783, + "balance_loss_mlp": 1.12847757, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.08316309975829886, + "language_loss": 0.88756025, + "learning_rate": 0.0009861230731227201, + "loss": 0.89907813, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.23291016, + "step": 535, + "time_per_iteration": 2.871997594833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188729, + "balance_loss_mlp": 1.16410041, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.08198011669981227, + "language_loss": 0.89923763, + "learning_rate": 0.0009860500902241973, + "loss": 0.91112483, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.24633789, + "step": 536, + "time_per_iteration": 2.623779058456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200536, + "balance_loss_mlp": 1.17560923, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.07805911222058415, + "language_loss": 0.94478881, + "learning_rate": 0.0009859769186242942, + "loss": 0.95679414, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.24914551, + "step": 537, + "time_per_iteration": 2.580596923828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237375, + "balance_loss_mlp": 1.21290088, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.07373890024349967, + "language_loss": 0.87774181, + 
"learning_rate": 0.0009859035583514187, + "loss": 0.89011556, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.24450684, + "step": 538, + "time_per_iteration": 2.6570377349853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278522, + "balance_loss_mlp": 1.25283265, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.09282522264822365, + "language_loss": 0.89254487, + "learning_rate": 0.0009858300094340517, + "loss": 0.90533006, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.25720215, + "step": 539, + "time_per_iteration": 2.787065267562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129717, + "balance_loss_mlp": 1.271981, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.12009350418361847, + "language_loss": 0.84273541, + "learning_rate": 0.0009857562719007473, + "loss": 0.85570705, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.2520752, + "step": 540, + "time_per_iteration": 2.60508394241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269758, + "balance_loss_mlp": 1.24520063, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.09915993306854447, + "language_loss": 0.86265039, + "learning_rate": 0.0009856823457801331, + "loss": 0.87534791, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.24560547, + "step": 541, + "time_per_iteration": 2.916395664215088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200462, + "balance_loss_mlp": 1.17673898, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.08980852435022621, + "language_loss": 0.93430036, + "learning_rate": 0.00098560823110091, + "loss": 0.94630498, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.23718262, + "step": 542, + "time_per_iteration": 2.6473944187164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011666, + "balance_loss_mlp": 1.14424849, 
+ "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 0.09857064774686858, + "language_loss": 0.94166034, + "learning_rate": 0.000985533927891851, + "loss": 0.95332634, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.22338867, + "step": 543, + "time_per_iteration": 2.7833001613616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152313, + "balance_loss_mlp": 1.13023496, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.11299567756475092, + "language_loss": 0.91803026, + "learning_rate": 0.0009854594361818044, + "loss": 0.92955339, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.22070312, + "step": 544, + "time_per_iteration": 2.7342488765716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145926, + "balance_loss_mlp": 1.12322879, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.10706082764174026, + "language_loss": 0.90779245, + "learning_rate": 0.0009853847559996897, + "loss": 0.91925174, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.22680664, + "step": 545, + "time_per_iteration": 2.7671496868133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115478, + "balance_loss_mlp": 1.13199878, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.09298705322285353, + "language_loss": 0.90420544, + "learning_rate": 0.0009853098873745, + "loss": 0.91575325, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.2277832, + "step": 546, + "time_per_iteration": 3.0312061309814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114995, + "balance_loss_mlp": 1.12715745, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.08666617811450783, + "language_loss": 0.89437926, + "learning_rate": 0.0009852348303353027, + "loss": 0.90587872, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.22802734, + "step": 547, + 
"time_per_iteration": 2.8053338527679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175894, + "balance_loss_mlp": 1.15260065, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.07202142444648872, + "language_loss": 0.8282218, + "learning_rate": 0.000985159584911237, + "loss": 0.83998078, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.23291016, + "step": 548, + "time_per_iteration": 3.168396472930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200066, + "balance_loss_mlp": 1.17569995, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.0989690478360349, + "language_loss": 0.89268672, + "learning_rate": 0.0009850841511315162, + "loss": 0.9046874, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.24365234, + "step": 549, + "time_per_iteration": 2.6511220932006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205335, + "balance_loss_mlp": 1.18058681, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.10906170470493136, + "language_loss": 0.90274942, + "learning_rate": 0.0009850085290254256, + "loss": 0.91480273, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.24755859, + "step": 550, + "time_per_iteration": 2.8123652935028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166073, + "balance_loss_mlp": 1.14193285, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.06887070936512274, + "language_loss": 0.8779422, + "learning_rate": 0.0009849327186223246, + "loss": 0.88960296, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.24121094, + "step": 551, + "time_per_iteration": 2.780959129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144115, + "balance_loss_mlp": 1.12010658, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.1035499947998288, + "language_loss": 0.94864386, + "learning_rate": 
0.000984856719951646, + "loss": 0.96008497, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.23986816, + "step": 552, + "time_per_iteration": 2.599581718444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135304, + "balance_loss_mlp": 1.1112473, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.08131430219430819, + "language_loss": 0.91351348, + "learning_rate": 0.0009847805330428943, + "loss": 0.92486656, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.24035645, + "step": 553, + "time_per_iteration": 2.9599480628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126363, + "balance_loss_mlp": 1.1017344, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.10883623187773357, + "language_loss": 0.92631853, + "learning_rate": 0.0009847041579256481, + "loss": 0.93758214, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.24633789, + "step": 554, + "time_per_iteration": 2.592348575592041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139312, + "balance_loss_mlp": 1.11518431, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.08685206815428315, + "language_loss": 0.94236493, + "learning_rate": 0.0009846275946295592, + "loss": 0.95375812, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.2409668, + "step": 555, + "time_per_iteration": 2.6748178005218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157684, + "balance_loss_mlp": 1.13367498, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.1423144419608042, + "language_loss": 0.86826319, + "learning_rate": 0.0009845508431843518, + "loss": 0.87984002, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.23974609, + "step": 556, + "time_per_iteration": 3.0652637481689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188505, + "balance_loss_mlp": 1.16398418, + "epoch": 
0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.08544226719489541, + "language_loss": 0.87931871, + "learning_rate": 0.0009844739036198233, + "loss": 0.89120376, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.24523926, + "step": 557, + "time_per_iteration": 2.667473793029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210589, + "balance_loss_mlp": 1.18594849, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.07677269921737997, + "language_loss": 0.9440788, + "learning_rate": 0.0009843967759658448, + "loss": 0.95618474, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.24658203, + "step": 558, + "time_per_iteration": 2.7628064155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132066, + "balance_loss_mlp": 1.11194348, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.0590422913979422, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73899817, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.20117188, + "step": 559, + "time_per_iteration": 4.902129888534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241703, + "balance_loss_mlp": 1.21570349, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.0867010736609256, + "language_loss": 0.9488945, + "learning_rate": 0.000984241956509384, + "loss": 0.96131158, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.26025391, + "step": 560, + "time_per_iteration": 2.6891891956329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208783, + "balance_loss_mlp": 1.18289042, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.08963888455934524, + "language_loss": 0.90658677, + "learning_rate": 0.0009841642647670078, + "loss": 0.91867459, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.25927734, + "step": 561, + "time_per_iteration": 
2.563408613204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198224, + "balance_loss_mlp": 1.17229605, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.08487676980325562, + "language_loss": 0.85033154, + "learning_rate": 0.0009840863850553944, + "loss": 0.86231375, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.25964355, + "step": 562, + "time_per_iteration": 2.9805734157562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183286, + "balance_loss_mlp": 1.157763, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.08249773787970602, + "language_loss": 0.90893888, + "learning_rate": 0.0009840083174047782, + "loss": 0.92077172, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.25537109, + "step": 563, + "time_per_iteration": 2.7391836643218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194315, + "balance_loss_mlp": 1.16986513, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.07051664629026161, + "language_loss": 0.85589021, + "learning_rate": 0.0009839300618454685, + "loss": 0.86783338, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.24438477, + "step": 564, + "time_per_iteration": 2.89290452003479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194246, + "balance_loss_mlp": 1.16989148, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.07367825547097939, + "language_loss": 0.91287452, + "learning_rate": 0.0009838516184078466, + "loss": 0.92481697, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.24353027, + "step": 565, + "time_per_iteration": 2.8416025638580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201114, + "balance_loss_mlp": 1.17573452, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.11472314835583913, + "language_loss": 0.88207066, + "learning_rate": 
0.0009837729871223669, + "loss": 0.89408183, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.25402832, + "step": 566, + "time_per_iteration": 2.6492197513580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249487, + "balance_loss_mlp": 1.22309399, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.07200956845133732, + "language_loss": 0.88285792, + "learning_rate": 0.0009836941680195568, + "loss": 0.89535284, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.2644043, + "step": 567, + "time_per_iteration": 2.794311285018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124424, + "balance_loss_mlp": 1.21801353, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.08672877457635139, + "language_loss": 0.83671671, + "learning_rate": 0.0009836151611300166, + "loss": 0.84915912, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.26245117, + "step": 568, + "time_per_iteration": 3.2202959060668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232018, + "balance_loss_mlp": 1.2069366, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.0737206182188589, + "language_loss": 0.9499715, + "learning_rate": 0.0009835359664844194, + "loss": 0.96229166, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.25097656, + "step": 569, + "time_per_iteration": 2.6723880767822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115473, + "balance_loss_mlp": 1.09935594, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.05305645754414589, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82152283, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.16113281, + "step": 570, + "time_per_iteration": 4.934283494949341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262159, + "balance_loss_mlp": 1.23583817, + 
"epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.0759630537733653, + "language_loss": 0.91932368, + "learning_rate": 0.0009833770140481118, + "loss": 0.93194532, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.2635498, + "step": 571, + "time_per_iteration": 2.6325361728668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240536, + "balance_loss_mlp": 1.21385729, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.07085220990305834, + "language_loss": 0.82309085, + "learning_rate": 0.000983297256319112, + "loss": 0.83549619, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.26733398, + "step": 572, + "time_per_iteration": 3.230297088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227341, + "balance_loss_mlp": 1.20004177, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.0905445578460947, + "language_loss": 0.86770016, + "learning_rate": 0.000983217310957477, + "loss": 0.87997353, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.27319336, + "step": 573, + "time_per_iteration": 2.8283607959747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230404, + "balance_loss_mlp": 1.20267606, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.08397098324277796, + "language_loss": 0.89933473, + "learning_rate": 0.000983137177994244, + "loss": 0.91163886, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.27734375, + "step": 574, + "time_per_iteration": 2.945197820663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184809, + "balance_loss_mlp": 1.15805852, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.08995501683398337, + "language_loss": 0.85942268, + "learning_rate": 0.0009830568574605235, + "loss": 0.87127078, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.26782227, + "step": 575, + 
"time_per_iteration": 2.9714908599853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173303, + "balance_loss_mlp": 1.14733911, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.11617237422906017, + "language_loss": 0.87585467, + "learning_rate": 0.0009829763493874992, + "loss": 0.88758773, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.2598877, + "step": 576, + "time_per_iteration": 3.0522892475128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185519, + "balance_loss_mlp": 1.15929341, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.07800734946110352, + "language_loss": 0.92923808, + "learning_rate": 0.0009828956538064264, + "loss": 0.94109321, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.26245117, + "step": 577, + "time_per_iteration": 2.8397951126098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198525, + "balance_loss_mlp": 1.17312193, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.07768178407950788, + "language_loss": 0.90871215, + "learning_rate": 0.0009828147707486344, + "loss": 0.92069739, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.25427246, + "step": 578, + "time_per_iteration": 2.714322805404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120727, + "balance_loss_mlp": 1.18262911, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.08360568840749934, + "language_loss": 0.86554426, + "learning_rate": 0.0009827337002455245, + "loss": 0.877617, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.24645996, + "step": 579, + "time_per_iteration": 2.742311477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195367, + "balance_loss_mlp": 1.17049956, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.07475116375685303, + "language_loss": 0.87853694, + "learning_rate": 
0.0009826524423285712, + "loss": 0.89049065, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.24865723, + "step": 580, + "time_per_iteration": 3.014310121536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212273, + "balance_loss_mlp": 1.18770432, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.09493717034802315, + "language_loss": 0.88884461, + "learning_rate": 0.0009825709970293218, + "loss": 0.90096736, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.2454834, + "step": 581, + "time_per_iteration": 3.004209518432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215399, + "balance_loss_mlp": 1.19164097, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.0873103144771369, + "language_loss": 0.95079505, + "learning_rate": 0.0009824893643793956, + "loss": 0.96294904, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.23754883, + "step": 582, + "time_per_iteration": 3.0893442630767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220934, + "balance_loss_mlp": 1.1956501, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.08836320076119632, + "language_loss": 0.87841964, + "learning_rate": 0.0009824075444104857, + "loss": 0.89062899, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.25280762, + "step": 583, + "time_per_iteration": 2.7537503242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239656, + "balance_loss_mlp": 1.21521807, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.16884309783149784, + "language_loss": 0.93345737, + "learning_rate": 0.000982325537154357, + "loss": 0.94585395, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.24450684, + "step": 584, + "time_per_iteration": 2.59409499168396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211835, + "balance_loss_mlp": 1.18743277, + "epoch": 
0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.08768097982415915, + "language_loss": 0.93578511, + "learning_rate": 0.0009822433426428484, + "loss": 0.94790351, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.24401855, + "step": 585, + "time_per_iteration": 2.581516742706299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190012, + "balance_loss_mlp": 1.16627765, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.09638114373338931, + "language_loss": 0.8707509, + "learning_rate": 0.0009821609609078697, + "loss": 0.88265103, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.23730469, + "step": 586, + "time_per_iteration": 2.6160855293273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192266, + "balance_loss_mlp": 1.16885376, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.08368187760832956, + "language_loss": 0.89230156, + "learning_rate": 0.0009820783919814045, + "loss": 0.90422428, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.23425293, + "step": 587, + "time_per_iteration": 2.8534207344055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168851, + "balance_loss_mlp": 1.14552212, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.1429978790264596, + "language_loss": 0.82743758, + "learning_rate": 0.0009819956358955095, + "loss": 0.83912605, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.2331543, + "step": 588, + "time_per_iteration": 2.5901453495025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173293, + "balance_loss_mlp": 1.14966619, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.08588056281957461, + "language_loss": 0.84002471, + "learning_rate": 0.0009819126926823127, + "loss": 0.85175765, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.23608398, + "step": 589, + "time_per_iteration": 
2.530374765396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202956, + "balance_loss_mlp": 1.17918611, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.07487704505114483, + "language_loss": 0.86892301, + "learning_rate": 0.000981829562374016, + "loss": 0.88095254, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.2376709, + "step": 590, + "time_per_iteration": 2.8030459880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244494, + "balance_loss_mlp": 1.22037804, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.12123010147526934, + "language_loss": 0.97345364, + "learning_rate": 0.0009817462450028933, + "loss": 0.98589861, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.24108887, + "step": 591, + "time_per_iteration": 2.7129569053649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233399, + "balance_loss_mlp": 1.20995021, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.08245604807530345, + "language_loss": 0.85052103, + "learning_rate": 0.0009816627406012916, + "loss": 0.86285496, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.23425293, + "step": 592, + "time_per_iteration": 2.8424665927886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218738, + "balance_loss_mlp": 1.19550395, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.128701110372521, + "language_loss": 0.84672415, + "learning_rate": 0.0009815790492016295, + "loss": 0.85891157, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.23217773, + "step": 593, + "time_per_iteration": 2.95451283454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171348, + "balance_loss_mlp": 1.14887691, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.14505795416516268, + "language_loss": 0.86793518, + "learning_rate": 0.0009814951708363993, 
+ "loss": 0.87964857, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.22473145, + "step": 594, + "time_per_iteration": 2.85953950881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125332, + "balance_loss_mlp": 1.11083615, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.044045371588173315, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79116321, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.14453125, + "step": 595, + "time_per_iteration": 4.819102048873901 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116091, + "balance_loss_mlp": 1.09400165, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.15046985558242026, + "language_loss": 0.88265449, + "learning_rate": 0.0009813268533395648, + "loss": 0.8938154, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.2208252, + "step": 596, + "time_per_iteration": 2.5988821983337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127071, + "balance_loss_mlp": 1.10389698, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.12036284201424394, + "language_loss": 0.87534207, + "learning_rate": 0.0009812424142733073, + "loss": 0.88661277, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.23168945, + "step": 597, + "time_per_iteration": 2.5434508323669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011341, + "balance_loss_mlp": 1.11084187, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.21736642596268407, + "language_loss": 0.85729969, + "learning_rate": 0.000981157788372175, + "loss": 0.86864072, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.23242188, + "step": 598, + "time_per_iteration": 3.0409185886383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140706, + "balance_loss_mlp": 1.11694789, + "epoch": 0.11523662947287418, + 
"flos": 545823567360.0, + "grad_norm": 0.09609751014588512, + "language_loss": 0.89140439, + "learning_rate": 0.0009810729756690223, + "loss": 0.90281147, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.23742676, + "step": 599, + "time_per_iteration": 2.7512025833129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149306, + "balance_loss_mlp": 1.12485611, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.09347854332414611, + "language_loss": 0.92009699, + "learning_rate": 0.0009809879761967766, + "loss": 0.93159008, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.24438477, + "step": 600, + "time_per_iteration": 2.966771364212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114736, + "balance_loss_mlp": 1.1223377, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.11723124982013416, + "language_loss": 0.86307055, + "learning_rate": 0.0009809027899884378, + "loss": 0.87454414, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.25036621, + "step": 601, + "time_per_iteration": 2.960700273513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160833, + "balance_loss_mlp": 1.13693142, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.1190375758971125, + "language_loss": 0.88418448, + "learning_rate": 0.0009808174170770779, + "loss": 0.89579284, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.2388916, + "step": 602, + "time_per_iteration": 2.8176493644714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012435, + "balance_loss_mlp": 0.99622273, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.011178693541089954, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.85910678, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.16210938, + "step": 603, + "time_per_iteration": 4.909565448760986 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_mlp": 1.24103987, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.08512052059651275, + "language_loss": 0.93440074, + "learning_rate": 0.0009806461112779462, + "loss": 0.94705629, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.24511719, + "step": 604, + "time_per_iteration": 2.658644199371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134615, + "balance_loss_mlp": 1.3188746, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.21802356099424494, + "language_loss": 0.87949467, + "learning_rate": 0.0009805601784566814, + "loss": 0.89295614, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.27294922, + "step": 605, + "time_per_iteration": 2.5276598930358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334827, + "balance_loss_mlp": 1.30897105, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.1053210941194693, + "language_loss": 0.95447874, + "learning_rate": 0.0009804740590654089, + "loss": 0.96782702, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.25854492, + "step": 606, + "time_per_iteration": 2.6621856689453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237167, + "balance_loss_mlp": 1.2128365, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.09607271254678196, + "language_loss": 0.89416385, + "learning_rate": 0.0009803877531375635, + "loss": 0.90653551, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.2434082, + "step": 607, + "time_per_iteration": 2.8813462257385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219293, + "balance_loss_mlp": 1.19459295, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.08760560664793143, + "language_loss": 0.90707058, + "learning_rate": 0.0009803012607066523, + "loss": 0.91926354, + 
"num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.24707031, + "step": 608, + "time_per_iteration": 2.7780392169952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185589, + "balance_loss_mlp": 1.16223621, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.10290817733218703, + "language_loss": 0.89330381, + "learning_rate": 0.0009802145818062543, + "loss": 0.90515971, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.23339844, + "step": 609, + "time_per_iteration": 2.713611364364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189107, + "balance_loss_mlp": 1.16636157, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.1057697966066493, + "language_loss": 0.91819966, + "learning_rate": 0.0009801277164700212, + "loss": 0.93009067, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.22741699, + "step": 610, + "time_per_iteration": 2.575333595275879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207403, + "balance_loss_mlp": 1.18378794, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.09616788336185009, + "language_loss": 0.89864278, + "learning_rate": 0.0009800406647316776, + "loss": 0.91071677, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.23608398, + "step": 611, + "time_per_iteration": 2.831953287124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156372, + "balance_loss_mlp": 1.14006376, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.06675579160113412, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78070831, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.16308594, + "step": 612, + "time_per_iteration": 4.820984840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252443, + "balance_loss_mlp": 1.22860086, + "epoch": 0.11792997306656407, + "flos": 
520522495488.0, + "grad_norm": 0.12351306502077156, + "language_loss": 0.8851943, + "learning_rate": 0.000979866002183916, + "loss": 0.89771867, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.23815918, + "step": 613, + "time_per_iteration": 2.6552364826202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233257, + "balance_loss_mlp": 1.20900965, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.09504576379881025, + "language_loss": 0.8953172, + "learning_rate": 0.0009797783914423082, + "loss": 0.90764976, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.24243164, + "step": 614, + "time_per_iteration": 2.8509650230407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120351, + "balance_loss_mlp": 1.18043077, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.09364161863028009, + "language_loss": 0.8453747, + "learning_rate": 0.0009796905944342094, + "loss": 0.85740978, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.23071289, + "step": 615, + "time_per_iteration": 2.8491313457489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204695, + "balance_loss_mlp": 1.18137729, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.07677248067083364, + "language_loss": 0.88213146, + "learning_rate": 0.0009796026111937057, + "loss": 0.89417839, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.2331543, + "step": 616, + "time_per_iteration": 2.601902484893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165796, + "balance_loss_mlp": 1.14331329, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.0938738615494663, + "language_loss": 0.88620937, + "learning_rate": 0.0009795144417549552, + "loss": 0.89786732, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.22473145, + "step": 617, + "time_per_iteration": 2.7134363651275635 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168398, + "balance_loss_mlp": 1.14661872, + "epoch": 0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.10272804913481705, + "language_loss": 0.89757544, + "learning_rate": 0.0009794260861521883, + "loss": 0.90925944, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.21801758, + "step": 618, + "time_per_iteration": 2.831108331680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156602, + "balance_loss_mlp": 1.13393998, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.1607893611237687, + "language_loss": 0.87325203, + "learning_rate": 0.0009793375444197075, + "loss": 0.88481802, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.2265625, + "step": 619, + "time_per_iteration": 2.6383235454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174094, + "balance_loss_mlp": 1.15122962, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.10347254391959168, + "language_loss": 0.85134327, + "learning_rate": 0.000979248816591888, + "loss": 0.8630842, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.22875977, + "step": 620, + "time_per_iteration": 2.7817084789276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186173, + "balance_loss_mlp": 1.16314173, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.09880033160570031, + "language_loss": 0.85983694, + "learning_rate": 0.0009791599027031766, + "loss": 0.87169874, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.23010254, + "step": 621, + "time_per_iteration": 3.0790488719940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202902, + "balance_loss_mlp": 1.17933416, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.0888737424862181, + "language_loss": 0.85755396, + "learning_rate": 0.0009790708027880932, + "loss": 0.86958289, + 
"num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.2355957, + "step": 622, + "time_per_iteration": 2.839409351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148001, + "balance_loss_mlp": 1.13073957, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.05973140246409555, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78575295, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.17285156, + "step": 623, + "time_per_iteration": 4.827035665512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208498, + "balance_loss_mlp": 1.18456042, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.14072799304068395, + "language_loss": 0.92775166, + "learning_rate": 0.0009788920450172487, + "loss": 0.93983662, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.23925781, + "step": 624, + "time_per_iteration": 2.688457489013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186727, + "balance_loss_mlp": 1.16287279, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.09148145427830927, + "language_loss": 0.89981961, + "learning_rate": 0.0009788023872308875, + "loss": 0.9116869, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.23852539, + "step": 625, + "time_per_iteration": 2.5552427768707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073346, + "balance_loss_mlp": 1.05656123, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.03421346211042783, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76502347, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.16796875, + "step": 626, + "time_per_iteration": 4.845045804977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152939, + "balance_loss_mlp": 1.12972903, + "epoch": 0.12062331666025394, + "flos": 
539839323648.0, + "grad_norm": 0.16289185985396562, + "language_loss": 0.93840104, + "learning_rate": 0.0009786225140303285, + "loss": 0.94993043, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.23217773, + "step": 627, + "time_per_iteration": 2.697042465209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167636, + "balance_loss_mlp": 1.14417565, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.2209026580633741, + "language_loss": 0.91874695, + "learning_rate": 0.0009785322986859634, + "loss": 0.93042338, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.23461914, + "step": 628, + "time_per_iteration": 2.6944122314453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153297, + "balance_loss_mlp": 1.12997985, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.07492335946827373, + "language_loss": 0.92751127, + "learning_rate": 0.0009784418975588838, + "loss": 0.93904424, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.23303223, + "step": 629, + "time_per_iteration": 2.7154979705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156607, + "balance_loss_mlp": 1.1338973, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.07449655700779013, + "language_loss": 0.9307186, + "learning_rate": 0.0009783513106841862, + "loss": 0.9422847, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.22717285, + "step": 630, + "time_per_iteration": 2.704155921936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078203, + "balance_loss_mlp": 1.06208599, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.048222043628353826, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77810907, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.16113281, + "step": 631, + "time_per_iteration": 4.9827399253845215 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186068, + "balance_loss_mlp": 1.16259575, + "epoch": 0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.0695579405445101, + "language_loss": 0.87454391, + "learning_rate": 0.0009781695798326854, + "loss": 0.88640457, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.23474121, + "step": 632, + "time_per_iteration": 2.6077868938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119723, + "balance_loss_mlp": 1.17401958, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.0874974071775435, + "language_loss": 0.87916714, + "learning_rate": 0.0009780784359264365, + "loss": 0.89113945, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.23205566, + "step": 633, + "time_per_iteration": 2.6383118629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040346, + "balance_loss_mlp": 1.02403784, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.031225790586482303, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75229043, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.16308594, + "step": 634, + "time_per_iteration": 4.7924864292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217333, + "balance_loss_mlp": 1.19409907, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.07796234580729426, + "language_loss": 0.8718015, + "learning_rate": 0.000977895591329867, + "loss": 0.88397485, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.23205566, + "step": 635, + "time_per_iteration": 2.803107976913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234374, + "balance_loss_mlp": 1.21001959, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.11392323325170377, + "language_loss": 0.86567664, + "learning_rate": 0.000977803890710533, + "loss": 0.87802041, + 
"num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.24304199, + "step": 636, + "time_per_iteration": 2.751648187637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120141, + "balance_loss_mlp": 1.17864108, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.07701221180236865, + "language_loss": 0.93102324, + "learning_rate": 0.0009777120045912774, + "loss": 0.94303727, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.2277832, + "step": 637, + "time_per_iteration": 2.691467761993408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186061, + "balance_loss_mlp": 1.16312516, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.08871868954386787, + "language_loss": 0.89725113, + "learning_rate": 0.0009776199330077736, + "loss": 0.90911174, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.22924805, + "step": 638, + "time_per_iteration": 2.7779197692871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117449, + "balance_loss_mlp": 1.15229297, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.08051745841053916, + "language_loss": 0.91847914, + "learning_rate": 0.0009775276759957667, + "loss": 0.93022406, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.2220459, + "step": 639, + "time_per_iteration": 2.8452744483947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170739, + "balance_loss_mlp": 1.14792228, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.0993028160053512, + "language_loss": 0.89413661, + "learning_rate": 0.0009774352335910745, + "loss": 0.90584403, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.22814941, + "step": 640, + "time_per_iteration": 2.8268258571624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011491, + "balance_loss_mlp": 1.12753499, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + 
"grad_norm": 0.08449801570349542, + "language_loss": 0.9440136, + "learning_rate": 0.000977342605829586, + "loss": 0.9555046, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.21569824, + "step": 641, + "time_per_iteration": 2.7570323944091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162286, + "balance_loss_mlp": 1.13913512, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.11072842132379487, + "language_loss": 0.85702711, + "learning_rate": 0.0009772497927472623, + "loss": 0.86864996, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.23144531, + "step": 642, + "time_per_iteration": 3.1265273094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165192, + "balance_loss_mlp": 1.14213657, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.12556940690050455, + "language_loss": 0.84848756, + "learning_rate": 0.0009771567943801368, + "loss": 0.86013943, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.23034668, + "step": 643, + "time_per_iteration": 2.652181386947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160262, + "balance_loss_mlp": 1.13739729, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.08337524575338892, + "language_loss": 0.8885237, + "learning_rate": 0.0009770636107643152, + "loss": 0.90012634, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.22851562, + "step": 644, + "time_per_iteration": 2.7387216091156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165702, + "balance_loss_mlp": 1.14195597, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.19339175735102193, + "language_loss": 0.86818463, + "learning_rate": 0.0009769702419359738, + "loss": 0.87984169, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.23730469, + "step": 645, + "time_per_iteration": 2.6588823795318604 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01173541, + "balance_loss_mlp": 1.15027177, + "epoch": 0.12427856868026164, + "flos": 745792137216.0, + "grad_norm": 0.07743854144019968, + "language_loss": 0.88816965, + "learning_rate": 0.000976876687931362, + "loss": 0.89990509, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.23254395, + "step": 646, + "time_per_iteration": 3.0269463062286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143309, + "balance_loss_mlp": 1.1195029, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.09200303883175577, + "language_loss": 0.84307587, + "learning_rate": 0.0009767829487868005, + "loss": 0.85450894, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.23791504, + "step": 647, + "time_per_iteration": 2.652456045150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136141, + "balance_loss_mlp": 1.11240613, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.0914449303971137, + "language_loss": 0.88396645, + "learning_rate": 0.000976689024538682, + "loss": 0.89532787, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.23718262, + "step": 648, + "time_per_iteration": 2.66267466545105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114222, + "balance_loss_mlp": 1.11798477, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.0994157560321478, + "language_loss": 0.86652195, + "learning_rate": 0.0009765949152234716, + "loss": 0.87794411, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.2421875, + "step": 649, + "time_per_iteration": 2.9676578044891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130303, + "balance_loss_mlp": 1.11628377, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.046775068167293626, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79816383, + "num_input_tokens_seen": 
54262432, + "router_z_loss_mlp": 0.140625, + "step": 650, + "time_per_iteration": 4.760566711425781 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117959, + "balance_loss_mlp": 1.1559155, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.09210474588463947, + "language_loss": 0.813963, + "learning_rate": 0.0009764061415379919, + "loss": 0.82575887, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.2364502, + "step": 651, + "time_per_iteration": 3.3511757850646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120244, + "balance_loss_mlp": 1.17746568, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.09212981752556385, + "language_loss": 0.87756586, + "learning_rate": 0.0009763114772410109, + "loss": 0.88959026, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.24975586, + "step": 652, + "time_per_iteration": 2.5980827808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224224, + "balance_loss_mlp": 1.20058513, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.08737716532166849, + "language_loss": 0.86069119, + "learning_rate": 0.0009762166280235146, + "loss": 0.87293345, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.23632812, + "step": 653, + "time_per_iteration": 2.9842958450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232055, + "balance_loss_mlp": 1.2083323, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.10849525216708464, + "language_loss": 0.86920303, + "learning_rate": 0.0009761215939223267, + "loss": 0.88152361, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.23706055, + "step": 654, + "time_per_iteration": 2.741058349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120932, + "balance_loss_mlp": 1.18547845, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 
0.12794458644218995, + "language_loss": 0.85666406, + "learning_rate": 0.0009760263749743428, + "loss": 0.86875725, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.23828125, + "step": 655, + "time_per_iteration": 2.5808663368225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180222, + "balance_loss_mlp": 1.15707195, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.095199105706819, + "language_loss": 0.89238775, + "learning_rate": 0.0009759309712165299, + "loss": 0.90418994, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.23144531, + "step": 656, + "time_per_iteration": 2.748532295227051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181081, + "balance_loss_mlp": 1.15800261, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.10916020635653645, + "language_loss": 0.9220295, + "learning_rate": 0.0009758353826859272, + "loss": 0.93384039, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.23071289, + "step": 657, + "time_per_iteration": 2.595853805541992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177185, + "balance_loss_mlp": 1.15273547, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.12847037355320456, + "language_loss": 0.87952709, + "learning_rate": 0.0009757396094196456, + "loss": 0.89129901, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.24438477, + "step": 658, + "time_per_iteration": 2.8620266914367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203673, + "balance_loss_mlp": 1.17950892, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.07321655622824354, + "language_loss": 0.83431864, + "learning_rate": 0.0009756436514548673, + "loss": 0.84635538, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.24169922, + "step": 659, + "time_per_iteration": 2.912091016769409 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01217457, + "balance_loss_mlp": 1.19229198, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, + "grad_norm": 0.10055529179538837, + "language_loss": 0.8726669, + "learning_rate": 0.0009755475088288466, + "loss": 0.88484144, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.25183105, + "step": 660, + "time_per_iteration": 2.781341075897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243827, + "balance_loss_mlp": 1.218292, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.1174457122427187, + "language_loss": 0.88868487, + "learning_rate": 0.0009754511815789095, + "loss": 0.90112311, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.25537109, + "step": 661, + "time_per_iteration": 2.8132684230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246052, + "balance_loss_mlp": 1.21920574, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.09745592985886121, + "language_loss": 0.8455224, + "learning_rate": 0.0009753546697424533, + "loss": 0.85798287, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.26904297, + "step": 662, + "time_per_iteration": 2.7095847129821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243151, + "balance_loss_mlp": 1.21792674, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.12502287201474796, + "language_loss": 0.89571029, + "learning_rate": 0.0009752579733569475, + "loss": 0.90814179, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.25244141, + "step": 663, + "time_per_iteration": 2.6534910202026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119538, + "balance_loss_mlp": 1.17935824, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.048799046747725165, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7607677, + "num_input_tokens_seen": 55453584, 
+ "router_z_loss_mlp": 0.16015625, + "step": 664, + "time_per_iteration": 4.974175453186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218811, + "balance_loss_mlp": 1.19439721, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.1143586633045421, + "language_loss": 0.88993388, + "learning_rate": 0.0009750640270890217, + "loss": 0.90212196, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.24401855, + "step": 665, + "time_per_iteration": 2.7663521766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124539, + "balance_loss_mlp": 1.22150016, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.11930184932546978, + "language_loss": 0.94833052, + "learning_rate": 0.0009749667772818983, + "loss": 0.96078444, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.23876953, + "step": 666, + "time_per_iteration": 3.01556134223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120473, + "balance_loss_mlp": 1.10473776, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.04410313188129877, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78056413, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.15722656, + "step": 667, + "time_per_iteration": 4.865432262420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226701, + "balance_loss_mlp": 1.20370543, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.11041987280926156, + "language_loss": 0.94443977, + "learning_rate": 0.0009747717245101093, + "loss": 0.95670676, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.22998047, + "step": 668, + "time_per_iteration": 2.564667224884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217641, + "balance_loss_mlp": 1.19444275, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.0905963820135437, 
+ "language_loss": 0.84166789, + "learning_rate": 0.00097467392162117, + "loss": 0.85384434, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.23193359, + "step": 669, + "time_per_iteration": 2.625565528869629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218894, + "balance_loss_mlp": 1.19641066, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.07707390480747152, + "language_loss": 0.90709603, + "learning_rate": 0.0009745759344474708, + "loss": 0.919285, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.22485352, + "step": 670, + "time_per_iteration": 2.828810691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210198, + "balance_loss_mlp": 1.18807316, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.1296418275033253, + "language_loss": 0.88266867, + "learning_rate": 0.0009744777630270536, + "loss": 0.89477074, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.22119141, + "step": 671, + "time_per_iteration": 2.5931460857391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205012, + "balance_loss_mlp": 1.18351889, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.16263031414063664, + "language_loss": 0.92705458, + "learning_rate": 0.000974379407398032, + "loss": 0.93910474, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.21508789, + "step": 672, + "time_per_iteration": 2.947148323059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208232, + "balance_loss_mlp": 1.18665552, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.09135110996657969, + "language_loss": 0.81593442, + "learning_rate": 0.0009742808675985913, + "loss": 0.82801676, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.21594238, + "step": 673, + "time_per_iteration": 3.179880380630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01223794, + "balance_loss_mlp": 1.20184779, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 0.08798796705409132, + "language_loss": 0.89740491, + "learning_rate": 0.0009741821436669876, + "loss": 0.90964288, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.21948242, + "step": 674, + "time_per_iteration": 2.5925161838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230582, + "balance_loss_mlp": 1.20812273, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.13739173158435178, + "language_loss": 0.91820276, + "learning_rate": 0.0009740832356415492, + "loss": 0.93050855, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.22473145, + "step": 675, + "time_per_iteration": 2.5184531211853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223494, + "balance_loss_mlp": 1.20120144, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.10341661200692882, + "language_loss": 0.87010336, + "learning_rate": 0.0009739841435606756, + "loss": 0.88233835, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.22290039, + "step": 676, + "time_per_iteration": 3.0507655143737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207183, + "balance_loss_mlp": 1.18511748, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.08057490768106465, + "language_loss": 0.89111441, + "learning_rate": 0.0009738848674628377, + "loss": 0.90318626, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.2208252, + "step": 677, + "time_per_iteration": 2.745363235473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121617, + "balance_loss_mlp": 1.19430709, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.0856975246411629, + "language_loss": 0.88498092, + "learning_rate": 0.000973785407386578, + "loss": 0.89714259, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 
0.21862793, + "step": 678, + "time_per_iteration": 2.778620958328247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214606, + "balance_loss_mlp": 1.191324, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.06828211935324495, + "language_loss": 0.86676407, + "learning_rate": 0.0009736857633705103, + "loss": 0.87891012, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.23266602, + "step": 679, + "time_per_iteration": 2.9231183528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209443, + "balance_loss_mlp": 1.18695986, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.0834800111741461, + "language_loss": 0.92100477, + "learning_rate": 0.0009735859354533196, + "loss": 0.93309915, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.22460938, + "step": 680, + "time_per_iteration": 2.775928258895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195854, + "balance_loss_mlp": 1.17248893, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.10927888529927046, + "language_loss": 0.91257143, + "learning_rate": 0.0009734859236737628, + "loss": 0.92453003, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.23339844, + "step": 681, + "time_per_iteration": 2.684873342514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171844, + "balance_loss_mlp": 1.1486578, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.1264690256888091, + "language_loss": 0.92692226, + "learning_rate": 0.0009733857280706678, + "loss": 0.93864071, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.23168945, + "step": 682, + "time_per_iteration": 2.6460657119750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174213, + "balance_loss_mlp": 1.15156293, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.10018713039090629, + "language_loss": 
0.83565485, + "learning_rate": 0.000973285348682934, + "loss": 0.84739697, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.2265625, + "step": 683, + "time_per_iteration": 2.758242607116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114811, + "balance_loss_mlp": 1.13504481, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.05076292773380049, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79046488, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.13085938, + "step": 684, + "time_per_iteration": 4.8192243576049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204502, + "balance_loss_mlp": 1.17974257, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.1066751932733185, + "language_loss": 0.84567851, + "learning_rate": 0.0009730840387095046, + "loss": 0.85772359, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.24768066, + "step": 685, + "time_per_iteration": 3.3115832805633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227082, + "balance_loss_mlp": 1.20198846, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.07078465407426249, + "language_loss": 0.90421009, + "learning_rate": 0.0009729831082019642, + "loss": 0.9164809, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.25097656, + "step": 686, + "time_per_iteration": 2.8678879737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252204, + "balance_loss_mlp": 1.22750425, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.09776828955155538, + "language_loss": 0.8801111, + "learning_rate": 0.0009728819940660958, + "loss": 0.89263314, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.24707031, + "step": 687, + "time_per_iteration": 2.7938969135284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263825, + 
"balance_loss_mlp": 1.23863578, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.10048379585191887, + "language_loss": 0.84283459, + "learning_rate": 0.0009727806963411557, + "loss": 0.8554728, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.25195312, + "step": 688, + "time_per_iteration": 2.607588529586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239051, + "balance_loss_mlp": 1.2133261, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.08603068006049115, + "language_loss": 0.8672629, + "learning_rate": 0.000972679215066471, + "loss": 0.87965345, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.25756836, + "step": 689, + "time_per_iteration": 2.7422516345977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224087, + "balance_loss_mlp": 1.19882667, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.10287089436887557, + "language_loss": 0.9870705, + "learning_rate": 0.0009725775502814401, + "loss": 0.99931133, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.25268555, + "step": 690, + "time_per_iteration": 2.5919952392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192013, + "balance_loss_mlp": 1.16732466, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.1091756570575493, + "language_loss": 0.84613961, + "learning_rate": 0.0009724757020255327, + "loss": 0.85805976, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.2467041, + "step": 691, + "time_per_iteration": 2.851348400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011849, + "balance_loss_mlp": 1.15994906, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.0968423296469171, + "language_loss": 0.86866987, + "learning_rate": 0.0009723736703382902, + "loss": 0.88051891, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.24951172, + 
"step": 692, + "time_per_iteration": 2.5881834030151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179974, + "balance_loss_mlp": 1.15652537, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.10463911515585092, + "language_loss": 0.82742584, + "learning_rate": 0.0009722714552593244, + "loss": 0.83922553, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.23413086, + "step": 693, + "time_per_iteration": 2.6343894004821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186752, + "balance_loss_mlp": 1.16344643, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.12210775976205426, + "language_loss": 0.93531036, + "learning_rate": 0.000972169056828319, + "loss": 0.94717789, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.23303223, + "step": 694, + "time_per_iteration": 2.4834342002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183255, + "balance_loss_mlp": 1.16046166, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.08175934073664855, + "language_loss": 0.87263072, + "learning_rate": 0.0009720664750850283, + "loss": 0.88446331, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.22790527, + "step": 695, + "time_per_iteration": 2.796005964279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191523, + "balance_loss_mlp": 1.16836047, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.0918947132133249, + "language_loss": 0.92442453, + "learning_rate": 0.0009719637100692784, + "loss": 0.9363398, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.23168945, + "step": 696, + "time_per_iteration": 2.7338545322418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173282, + "balance_loss_mlp": 1.15093064, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.09425790223988205, + "language_loss": 0.82822204, + 
"learning_rate": 0.0009718607618209661, + "loss": 0.83995485, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.22351074, + "step": 697, + "time_per_iteration": 2.8834567070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167823, + "balance_loss_mlp": 1.14468443, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.07380520807835853, + "language_loss": 0.87331033, + "learning_rate": 0.0009717576303800595, + "loss": 0.88498855, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.23120117, + "step": 698, + "time_per_iteration": 3.0662593841552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189182, + "balance_loss_mlp": 1.1649704, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.08733354578890483, + "language_loss": 0.85059655, + "learning_rate": 0.0009716543157865975, + "loss": 0.86248839, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.24182129, + "step": 699, + "time_per_iteration": 2.7156968116760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210396, + "balance_loss_mlp": 1.1879611, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.08759306221047211, + "language_loss": 0.82954025, + "learning_rate": 0.0009715508180806907, + "loss": 0.84164423, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.22436523, + "step": 700, + "time_per_iteration": 3.204936981201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209609, + "balance_loss_mlp": 1.18669748, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.07843453256975112, + "language_loss": 0.89359999, + "learning_rate": 0.0009714471373025202, + "loss": 0.90569609, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.22900391, + "step": 701, + "time_per_iteration": 3.4600374698638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186239, + "balance_loss_mlp": 
1.16323161, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.07505390512906053, + "language_loss": 0.88395512, + "learning_rate": 0.0009713432734923386, + "loss": 0.89581752, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.23010254, + "step": 702, + "time_per_iteration": 2.638005018234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173659, + "balance_loss_mlp": 1.15109301, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.09376344684626736, + "language_loss": 0.86520576, + "learning_rate": 0.0009712392266904696, + "loss": 0.8769424, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.22558594, + "step": 703, + "time_per_iteration": 2.7503063678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116976, + "balance_loss_mlp": 1.14838624, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.18430775331568308, + "language_loss": 0.85049546, + "learning_rate": 0.0009711349969373076, + "loss": 0.86219305, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.21386719, + "step": 704, + "time_per_iteration": 3.1815178394317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166202, + "balance_loss_mlp": 1.14376664, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.08099598593900344, + "language_loss": 0.80275941, + "learning_rate": 0.0009710305842733178, + "loss": 0.81442142, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.22436523, + "step": 705, + "time_per_iteration": 2.7307353019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152043, + "balance_loss_mlp": 1.13138402, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.08979940018627898, + "language_loss": 0.89208561, + "learning_rate": 0.0009709259887390373, + "loss": 0.90360606, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.20666504, + "step": 706, + 
"time_per_iteration": 2.6135804653167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160605, + "balance_loss_mlp": 1.13901603, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.11609486524108804, + "language_loss": 0.9066751, + "learning_rate": 0.0009708212103750737, + "loss": 0.91828114, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.21606445, + "step": 707, + "time_per_iteration": 2.632742166519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185341, + "balance_loss_mlp": 1.16383576, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.10488018026765993, + "language_loss": 0.86886567, + "learning_rate": 0.0009707162492221051, + "loss": 0.88071907, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.21508789, + "step": 708, + "time_per_iteration": 2.9155325889587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221514, + "balance_loss_mlp": 1.19948387, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.11565397704484869, + "language_loss": 0.87553132, + "learning_rate": 0.0009706111053208815, + "loss": 0.88774645, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.22058105, + "step": 709, + "time_per_iteration": 2.843981981277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233371, + "balance_loss_mlp": 1.21016061, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.10007182380605975, + "language_loss": 0.85645008, + "learning_rate": 0.0009705057787122232, + "loss": 0.86878371, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.23193359, + "step": 710, + "time_per_iteration": 2.594890832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195501, + "balance_loss_mlp": 1.17281508, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.08836610284298578, + "language_loss": 0.90505099, + "learning_rate": 
0.0009704002694370216, + "loss": 0.91700602, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.22680664, + "step": 711, + "time_per_iteration": 2.5702362060546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117145, + "balance_loss_mlp": 1.14863288, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.11670756159189942, + "language_loss": 0.86028767, + "learning_rate": 0.0009702945775362388, + "loss": 0.87200224, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.22802734, + "step": 712, + "time_per_iteration": 2.6679470539093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149984, + "balance_loss_mlp": 1.12776387, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.10271253203083616, + "language_loss": 0.86890107, + "learning_rate": 0.0009701887030509086, + "loss": 0.8804009, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.22229004, + "step": 713, + "time_per_iteration": 2.618906021118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112596, + "balance_loss_mlp": 1.1041683, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.09375417211048337, + "language_loss": 0.90942538, + "learning_rate": 0.0009700826460221346, + "loss": 0.92068493, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.21801758, + "step": 714, + "time_per_iteration": 2.7277417182922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133315, + "balance_loss_mlp": 1.11104631, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.1250628990201497, + "language_loss": 0.92436254, + "learning_rate": 0.0009699764064910921, + "loss": 0.93569565, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.22265625, + "step": 715, + "time_per_iteration": 2.900053024291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129357, + "balance_loss_mlp": 1.10697007, + "epoch": 
0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.18348866981846054, + "language_loss": 0.86833155, + "learning_rate": 0.0009698699844990268, + "loss": 0.87962508, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.22387695, + "step": 716, + "time_per_iteration": 2.645792245864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136392, + "balance_loss_mlp": 1.11483872, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.08476879745046602, + "language_loss": 0.87948525, + "learning_rate": 0.0009697633800872555, + "loss": 0.89084923, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.21557617, + "step": 717, + "time_per_iteration": 2.9197771549224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153248, + "balance_loss_mlp": 1.13183844, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.08051298122060387, + "language_loss": 0.90472651, + "learning_rate": 0.0009696565932971655, + "loss": 0.91625893, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.2142334, + "step": 718, + "time_per_iteration": 2.9118661880493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157354, + "balance_loss_mlp": 1.1350143, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.09173992406124648, + "language_loss": 0.897349, + "learning_rate": 0.0009695496241702153, + "loss": 0.90892255, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.22338867, + "step": 719, + "time_per_iteration": 2.8108739852905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184496, + "balance_loss_mlp": 1.16145301, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.09716749239115424, + "language_loss": 0.85599422, + "learning_rate": 0.0009694424727479339, + "loss": 0.86783922, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.23034668, + "step": 720, + "time_per_iteration": 
2.9078242778778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190722, + "balance_loss_mlp": 1.16825104, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.08276113558291018, + "language_loss": 0.88687241, + "learning_rate": 0.0009693351390719213, + "loss": 0.89877963, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.22473145, + "step": 721, + "time_per_iteration": 2.727829933166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214778, + "balance_loss_mlp": 1.19178224, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.08055125516722848, + "language_loss": 0.9053812, + "learning_rate": 0.000969227623183848, + "loss": 0.91752893, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.22998047, + "step": 722, + "time_per_iteration": 2.8233954906463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202462, + "balance_loss_mlp": 1.17980003, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.06957111358845897, + "language_loss": 0.90902817, + "learning_rate": 0.0009691199251254554, + "loss": 0.92105281, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.22668457, + "step": 723, + "time_per_iteration": 2.838449001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188398, + "balance_loss_mlp": 1.16651106, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.08029384244148012, + "language_loss": 0.86382651, + "learning_rate": 0.0009690120449385555, + "loss": 0.87571049, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.21899414, + "step": 724, + "time_per_iteration": 2.7877347469329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191299, + "balance_loss_mlp": 1.16917384, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.113442270614728, + "language_loss": 0.92300928, + "learning_rate": 0.0009689039826650312, + 
"loss": 0.93492222, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.22131348, + "step": 725, + "time_per_iteration": 2.8086507320404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219435, + "balance_loss_mlp": 1.20293677, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.07583456833656638, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77742493, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.16503906, + "step": 726, + "time_per_iteration": 4.891220331192017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177839, + "balance_loss_mlp": 1.15583265, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.4935698294407845, + "language_loss": 0.86680418, + "learning_rate": 0.0009686873120259941, + "loss": 0.8785826, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.22021484, + "step": 727, + "time_per_iteration": 2.584016799926758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220836, + "balance_loss_mlp": 1.19853175, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.12530325225106098, + "language_loss": 0.86788189, + "learning_rate": 0.0009685787037446004, + "loss": 0.88009018, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.22314453, + "step": 728, + "time_per_iteration": 2.7812938690185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256231, + "balance_loss_mlp": 1.2321384, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.19184429152401888, + "language_loss": 0.86789989, + "learning_rate": 0.0009684699135448201, + "loss": 0.88046223, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.24072266, + "step": 729, + "time_per_iteration": 2.7354156970977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316519, + "balance_loss_mlp": 1.29105544, + "epoch": 0.1404386302424009, + 
"flos": 506584585728.0, + "grad_norm": 0.08142335105414879, + "language_loss": 0.91990757, + "learning_rate": 0.0009683609414688895, + "loss": 0.93307269, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.25463867, + "step": 730, + "time_per_iteration": 2.7542572021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396811, + "balance_loss_mlp": 1.36896372, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.09882218945586521, + "language_loss": 0.86064744, + "learning_rate": 0.0009682517875591154, + "loss": 0.87461555, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.27856445, + "step": 731, + "time_per_iteration": 2.7971835136413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440007, + "balance_loss_mlp": 1.41070533, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.11775580833602758, + "language_loss": 0.85772473, + "learning_rate": 0.0009681424518578749, + "loss": 0.87212479, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.29248047, + "step": 732, + "time_per_iteration": 2.742525100708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460191, + "balance_loss_mlp": 1.43045998, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.14540211876107528, + "language_loss": 0.87523216, + "learning_rate": 0.000968032934407616, + "loss": 0.88983405, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.296875, + "step": 733, + "time_per_iteration": 2.586650848388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01389602, + "balance_loss_mlp": 1.35989547, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.09505428174523772, + "language_loss": 0.81872886, + "learning_rate": 0.0009679232352508571, + "loss": 0.83262491, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.29711914, + "step": 734, + "time_per_iteration": 2.8065295219421387 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01337262, + "balance_loss_mlp": 1.30776978, + "epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.08594729931011787, + "language_loss": 0.8053807, + "learning_rate": 0.0009678133544301871, + "loss": 0.8187533, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.29492188, + "step": 735, + "time_per_iteration": 2.681156635284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290781, + "balance_loss_mlp": 1.26231337, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.07917660118228964, + "language_loss": 0.91284931, + "learning_rate": 0.0009677032919882658, + "loss": 0.92575711, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.28442383, + "step": 736, + "time_per_iteration": 2.701876163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267129, + "balance_loss_mlp": 1.2393055, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.11161795715290385, + "language_loss": 0.91632634, + "learning_rate": 0.000967593047967823, + "loss": 0.92899764, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.27832031, + "step": 737, + "time_per_iteration": 2.549489736557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257034, + "balance_loss_mlp": 1.22987819, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.11515852654264594, + "language_loss": 0.86905932, + "learning_rate": 0.0009674826224116593, + "loss": 0.88162971, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.27160645, + "step": 738, + "time_per_iteration": 2.8459107875823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01274254, + "balance_loss_mlp": 1.24875474, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.086163857469945, + "language_loss": 0.8627907, + "learning_rate": 0.0009673720153626455, + "loss": 0.87553322, + 
"num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.25512695, + "step": 739, + "time_per_iteration": 2.6033051013946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01298128, + "balance_loss_mlp": 1.27345169, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.07922284002741106, + "language_loss": 0.8672145, + "learning_rate": 0.0009672612268637235, + "loss": 0.88019574, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.2467041, + "step": 740, + "time_per_iteration": 2.639249801635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331294, + "balance_loss_mlp": 1.30575967, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.09083563941739939, + "language_loss": 0.84015429, + "learning_rate": 0.0009671502569579048, + "loss": 0.85346723, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.25537109, + "step": 741, + "time_per_iteration": 2.784358263015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372438, + "balance_loss_mlp": 1.34778547, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.08785871424370759, + "language_loss": 0.89829892, + "learning_rate": 0.0009670391056882719, + "loss": 0.91202337, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.2467041, + "step": 742, + "time_per_iteration": 2.765284299850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01384139, + "balance_loss_mlp": 1.35946321, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.09890816943940939, + "language_loss": 0.88263386, + "learning_rate": 0.0009669277730979776, + "loss": 0.89647526, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.24694824, + "step": 743, + "time_per_iteration": 3.2124171257019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409259, + "balance_loss_mlp": 1.38365269, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, 
+ "grad_norm": 0.08939291923456745, + "language_loss": 0.85339808, + "learning_rate": 0.0009668162592302449, + "loss": 0.86749065, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.25610352, + "step": 744, + "time_per_iteration": 2.947239398956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01413521, + "balance_loss_mlp": 1.38784337, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.12486964956779355, + "language_loss": 0.86141676, + "learning_rate": 0.0009667045641283676, + "loss": 0.87555194, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.25708008, + "step": 745, + "time_per_iteration": 2.67399001121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345232, + "balance_loss_mlp": 1.32049656, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.09833561966825685, + "language_loss": 0.94721901, + "learning_rate": 0.0009665926878357092, + "loss": 0.96067131, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.24743652, + "step": 746, + "time_per_iteration": 2.951524257659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308325, + "balance_loss_mlp": 1.28470945, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.09374380516730212, + "language_loss": 0.90804815, + "learning_rate": 0.0009664806303957043, + "loss": 0.92113143, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.23608398, + "step": 747, + "time_per_iteration": 2.7018370628356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290979, + "balance_loss_mlp": 1.26711321, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.09976705309421963, + "language_loss": 0.87274301, + "learning_rate": 0.0009663683918518571, + "loss": 0.88565284, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.23840332, + "step": 748, + "time_per_iteration": 2.9669973850250244 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260109, + "balance_loss_mlp": 1.23742342, + "epoch": 0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.09601459473111058, + "language_loss": 0.85172814, + "learning_rate": 0.0009662559722477428, + "loss": 0.86432928, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.22680664, + "step": 749, + "time_per_iteration": 2.692737579345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01313989, + "balance_loss_mlp": 1.2952019, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.07630612016334831, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77476966, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.1875, + "step": 750, + "time_per_iteration": 5.012727975845337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203648, + "balance_loss_mlp": 1.18093836, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.10872642357348963, + "language_loss": 0.88863885, + "learning_rate": 0.0009660305900333632, + "loss": 0.90067536, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.22705078, + "step": 751, + "time_per_iteration": 2.715942859649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173383, + "balance_loss_mlp": 1.15045881, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.08046883529286915, + "language_loss": 0.82496673, + "learning_rate": 0.0009659176275105992, + "loss": 0.83670056, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.22924805, + "step": 752, + "time_per_iteration": 2.713360071182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180444, + "balance_loss_mlp": 1.15698361, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.07494959784402849, + "language_loss": 0.85518491, + "learning_rate": 0.0009658044841025701, + "loss": 0.86698937, + 
"num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.23425293, + "step": 753, + "time_per_iteration": 2.7982797622680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117179, + "balance_loss_mlp": 1.14774585, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.10908868033385523, + "language_loss": 0.81575012, + "learning_rate": 0.0009656911598532021, + "loss": 0.82746804, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.24023438, + "step": 754, + "time_per_iteration": 2.642843246459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216386, + "balance_loss_mlp": 1.19192445, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.08024204468384731, + "language_loss": 0.89968902, + "learning_rate": 0.0009655776548064917, + "loss": 0.91185284, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.24462891, + "step": 755, + "time_per_iteration": 2.6598751544952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240024, + "balance_loss_mlp": 1.2152878, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.0778788297064716, + "language_loss": 0.88022745, + "learning_rate": 0.0009654639690065054, + "loss": 0.89262772, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.24743652, + "step": 756, + "time_per_iteration": 2.8861637115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126465, + "balance_loss_mlp": 1.2393297, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.09020306103656467, + "language_loss": 0.87895447, + "learning_rate": 0.00096535010249738, + "loss": 0.89160097, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.25317383, + "step": 757, + "time_per_iteration": 2.7438864707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270837, + "balance_loss_mlp": 1.24456334, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + 
"grad_norm": 0.12633601395220453, + "language_loss": 0.82038969, + "learning_rate": 0.0009652360553233224, + "loss": 0.83309805, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.26318359, + "step": 758, + "time_per_iteration": 2.7446844577789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210641, + "balance_loss_mlp": 1.18994594, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.05582061662785393, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.7498439, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.20703125, + "step": 759, + "time_per_iteration": 4.942702054977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212546, + "balance_loss_mlp": 1.18641555, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.06567012775246582, + "language_loss": 0.81178761, + "learning_rate": 0.0009650074191575883, + "loss": 0.8239131, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.26171875, + "step": 760, + "time_per_iteration": 3.2085912227630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198598, + "balance_loss_mlp": 1.17261064, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.07877672537318793, + "language_loss": 0.85659027, + "learning_rate": 0.0009648928302546766, + "loss": 0.86857623, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.26013184, + "step": 761, + "time_per_iteration": 2.7206709384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176065, + "balance_loss_mlp": 1.15095961, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.07899561323852963, + "language_loss": 0.85068321, + "learning_rate": 0.0009647780608643613, + "loss": 0.86244392, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.25109863, + "step": 762, + "time_per_iteration": 3.4438586235046387 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170525, + "balance_loss_mlp": 1.14620686, + "epoch": 0.1467872258560985, + "flos": 500671922688.0, + "grad_norm": 0.141987740723005, + "language_loss": 0.87758678, + "learning_rate": 0.0009646631110312001, + "loss": 0.88929206, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.24304199, + "step": 763, + "time_per_iteration": 2.6546902656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152908, + "balance_loss_mlp": 1.12836289, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.07748728130668867, + "language_loss": 0.88344562, + "learning_rate": 0.0009645479807998203, + "loss": 0.89497471, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.2454834, + "step": 764, + "time_per_iteration": 2.7865586280822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149811, + "balance_loss_mlp": 1.12623131, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.07163260805176828, + "language_loss": 0.92376024, + "learning_rate": 0.0009644326702149196, + "loss": 0.93525833, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.23571777, + "step": 765, + "time_per_iteration": 2.729707717895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114598, + "balance_loss_mlp": 1.12176871, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.10016890685610987, + "language_loss": 0.84570462, + "learning_rate": 0.0009643171793212653, + "loss": 0.85716444, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.24206543, + "step": 766, + "time_per_iteration": 3.104130983352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147763, + "balance_loss_mlp": 1.12331319, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.0994818648660217, + "language_loss": 0.88828337, + "learning_rate": 0.0009642015081636952, + "loss": 0.89976102, + 
"num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.24438477, + "step": 767, + "time_per_iteration": 2.6991779804229736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118291, + "balance_loss_mlp": 1.15871024, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.10983776315132832, + "language_loss": 0.87698913, + "learning_rate": 0.0009640856567871166, + "loss": 0.8888182, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.24182129, + "step": 768, + "time_per_iteration": 2.5240631103515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212502, + "balance_loss_mlp": 1.18818331, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.07387168528771362, + "language_loss": 0.88451684, + "learning_rate": 0.0009639696252365072, + "loss": 0.89664185, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.24304199, + "step": 769, + "time_per_iteration": 3.0557117462158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239197, + "balance_loss_mlp": 1.21551013, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.09914913961127292, + "language_loss": 0.8159318, + "learning_rate": 0.0009638534135569144, + "loss": 0.82832372, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.23657227, + "step": 770, + "time_per_iteration": 2.9298524856567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245135, + "balance_loss_mlp": 1.22161531, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.09866814803314855, + "language_loss": 0.89646047, + "learning_rate": 0.0009637370217934554, + "loss": 0.90891182, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.23498535, + "step": 771, + "time_per_iteration": 2.682309865951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221897, + "balance_loss_mlp": 1.19855595, + "epoch": 0.14851866102347056, + "flos": 
588161129472.0, + "grad_norm": 0.06824551266768007, + "language_loss": 0.83023787, + "learning_rate": 0.0009636204499913175, + "loss": 0.84245688, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.23327637, + "step": 772, + "time_per_iteration": 2.883767604827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223775, + "balance_loss_mlp": 1.20097065, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.07043844896966983, + "language_loss": 0.87725186, + "learning_rate": 0.0009635036981957581, + "loss": 0.88948965, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.22802734, + "step": 773, + "time_per_iteration": 2.9000537395477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187129, + "balance_loss_mlp": 1.16394269, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.15141860037933205, + "language_loss": 0.90646893, + "learning_rate": 0.0009633867664521043, + "loss": 0.91834021, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.23168945, + "step": 774, + "time_per_iteration": 2.8832309246063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169397, + "balance_loss_mlp": 1.14643705, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.08953509264873717, + "language_loss": 0.86451691, + "learning_rate": 0.0009632696548057527, + "loss": 0.87621093, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.22961426, + "step": 775, + "time_per_iteration": 2.5678458213806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114459, + "balance_loss_mlp": 1.12229764, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.10138358829207124, + "language_loss": 0.84634435, + "learning_rate": 0.0009631523633021704, + "loss": 0.85779023, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.22290039, + "step": 776, + "time_per_iteration": 2.8479549884796143 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127015, + "balance_loss_mlp": 1.10418677, + "epoch": 0.14948056944978838, + "flos": 561772744704.0, + "grad_norm": 0.10363335088159256, + "language_loss": 0.88188493, + "learning_rate": 0.0009630348919868936, + "loss": 0.89315504, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.22814941, + "step": 777, + "time_per_iteration": 2.7757747173309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136966, + "balance_loss_mlp": 1.11441135, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.09986786801472973, + "language_loss": 0.81042939, + "learning_rate": 0.0009629172409055293, + "loss": 0.82179904, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.22558594, + "step": 778, + "time_per_iteration": 2.5126540660858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145221, + "balance_loss_mlp": 1.12336957, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.09261188529780942, + "language_loss": 0.87480628, + "learning_rate": 0.0009627994101037531, + "loss": 0.88625842, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.21875, + "step": 779, + "time_per_iteration": 2.7716262340545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115587, + "balance_loss_mlp": 1.13354254, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.08443086809005321, + "language_loss": 0.88840389, + "learning_rate": 0.0009626813996273114, + "loss": 0.8999626, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.22338867, + "step": 780, + "time_per_iteration": 2.8740992546081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186209, + "balance_loss_mlp": 1.16370249, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.09833782575281567, + "language_loss": 0.88844621, + "learning_rate": 0.0009625632095220198, + "loss": 0.90030831, + 
"num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.22497559, + "step": 781, + "time_per_iteration": 2.9050698280334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204209, + "balance_loss_mlp": 1.18169069, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.1242367807618526, + "language_loss": 0.87087309, + "learning_rate": 0.0009624448398337637, + "loss": 0.88291514, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.22509766, + "step": 782, + "time_per_iteration": 2.5470597743988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227287, + "balance_loss_mlp": 1.20476806, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.08884420814610612, + "language_loss": 0.8877629, + "learning_rate": 0.0009623262906084984, + "loss": 0.90003586, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.22521973, + "step": 783, + "time_per_iteration": 3.0006895065307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229008, + "balance_loss_mlp": 1.20682311, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.08808618298813263, + "language_loss": 0.8990804, + "learning_rate": 0.0009622075618922486, + "loss": 0.91137052, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.22192383, + "step": 784, + "time_per_iteration": 2.7111520767211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207095, + "balance_loss_mlp": 1.18568492, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.08652833198143661, + "language_loss": 0.87003136, + "learning_rate": 0.0009620886537311091, + "loss": 0.88210225, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.2142334, + "step": 785, + "time_per_iteration": 2.6401422023773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181477, + "balance_loss_mlp": 1.15950704, + "epoch": 0.15121200461716044, + "flos": 
457756199424.0, + "grad_norm": 0.2899950143802249, + "language_loss": 0.85118186, + "learning_rate": 0.000961969566171244, + "loss": 0.8629967, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.21972656, + "step": 786, + "time_per_iteration": 2.526909351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196609, + "balance_loss_mlp": 1.17443573, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.08121966250588863, + "language_loss": 0.90082663, + "learning_rate": 0.0009618502992588873, + "loss": 0.91279268, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.22167969, + "step": 787, + "time_per_iteration": 2.6575541496276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230806, + "balance_loss_mlp": 1.20764375, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.0715770490301525, + "language_loss": 0.87907356, + "learning_rate": 0.0009617308530403424, + "loss": 0.89138162, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.23168945, + "step": 788, + "time_per_iteration": 3.028930187225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258523, + "balance_loss_mlp": 1.23478842, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0802298351217653, + "language_loss": 0.87239158, + "learning_rate": 0.0009616112275619825, + "loss": 0.8849768, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.23718262, + "step": 789, + "time_per_iteration": 2.746056079864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132077, + "balance_loss_mlp": 1.29635596, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.12648829262821384, + "language_loss": 0.83592963, + "learning_rate": 0.0009614914228702503, + "loss": 0.84913737, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.24414062, + "step": 790, + "time_per_iteration": 2.6734559535980225 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308065, + "balance_loss_mlp": 1.28415179, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + "grad_norm": 0.09276885660597874, + "language_loss": 0.89010954, + "learning_rate": 0.0009613714390116581, + "loss": 0.9031902, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.23901367, + "step": 791, + "time_per_iteration": 2.983484983444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285002, + "balance_loss_mlp": 1.26071882, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.07985140077311874, + "language_loss": 0.85613286, + "learning_rate": 0.0009612512760327879, + "loss": 0.86898291, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.24291992, + "step": 792, + "time_per_iteration": 2.883850336074829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244556, + "balance_loss_mlp": 1.21998703, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.09831690791880561, + "language_loss": 0.84491324, + "learning_rate": 0.0009611309339802909, + "loss": 0.85735881, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.24560547, + "step": 793, + "time_per_iteration": 2.4435439109802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207721, + "balance_loss_mlp": 1.1844871, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.0855298606279622, + "language_loss": 0.83781004, + "learning_rate": 0.0009610104129008881, + "loss": 0.84988725, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.23205566, + "step": 794, + "time_per_iteration": 3.13722825050354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196195, + "balance_loss_mlp": 1.17304444, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.09863604959388503, + "language_loss": 0.88015008, + "learning_rate": 0.0009608897128413701, + "loss": 0.89211196, + 
"num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.23132324, + "step": 795, + "time_per_iteration": 2.746291160583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176373, + "balance_loss_mlp": 1.15306783, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.059228494387600535, + "language_loss": 0.85641718, + "learning_rate": 0.0009607688338485965, + "loss": 0.86818099, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.23278809, + "step": 796, + "time_per_iteration": 2.8617959022521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152933, + "balance_loss_mlp": 1.12994909, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.11279362274359876, + "language_loss": 0.90298712, + "learning_rate": 0.0009606477759694969, + "loss": 0.91451651, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.22998047, + "step": 797, + "time_per_iteration": 3.054548978805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147506, + "balance_loss_mlp": 1.12495136, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.1240450491743707, + "language_loss": 0.87260056, + "learning_rate": 0.0009605265392510703, + "loss": 0.88407564, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.22546387, + "step": 798, + "time_per_iteration": 2.660917282104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164418, + "balance_loss_mlp": 1.14092219, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.07786555450456673, + "language_loss": 0.91656721, + "learning_rate": 0.0009604051237403846, + "loss": 0.92821133, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.23474121, + "step": 799, + "time_per_iteration": 2.6837708950042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189347, + "balance_loss_mlp": 1.16668534, + "epoch": 0.15390534821085033, + "flos": 
395219699712.0, + "grad_norm": 0.09844042951466975, + "language_loss": 0.85933173, + "learning_rate": 0.0009602835294845776, + "loss": 0.87122524, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.2265625, + "step": 800, + "time_per_iteration": 2.4643006324768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201717, + "balance_loss_mlp": 1.17804241, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.08383413994751185, + "language_loss": 0.90000272, + "learning_rate": 0.0009601617565308565, + "loss": 0.91201991, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.23681641, + "step": 801, + "time_per_iteration": 2.6335196495056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211967, + "balance_loss_mlp": 1.18856657, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.11945007862770202, + "language_loss": 0.86351627, + "learning_rate": 0.0009600398049264977, + "loss": 0.87563592, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.23413086, + "step": 802, + "time_per_iteration": 3.0110597610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188433, + "balance_loss_mlp": 1.16469824, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.08697800210878956, + "language_loss": 0.9162643, + "learning_rate": 0.0009599176747188469, + "loss": 0.92814863, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.23718262, + "step": 803, + "time_per_iteration": 2.828881025314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169648, + "balance_loss_mlp": 1.14716554, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.09755966571473051, + "language_loss": 0.82901067, + "learning_rate": 0.0009597953659553196, + "loss": 0.84070712, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.22485352, + "step": 804, + "time_per_iteration": 2.744241952896118 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163065, + "balance_loss_mlp": 1.14110649, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 0.08461871579014175, + "language_loss": 0.8877238, + "learning_rate": 0.0009596728786833997, + "loss": 0.89935452, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.21960449, + "step": 805, + "time_per_iteration": 2.637615203857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153079, + "balance_loss_mlp": 1.13075089, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.07567223700797457, + "language_loss": 0.89263672, + "learning_rate": 0.0009595502129506415, + "loss": 0.90416753, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.2232666, + "step": 806, + "time_per_iteration": 3.381657838821411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157381, + "balance_loss_mlp": 1.13502955, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.08260081287011234, + "language_loss": 0.82411599, + "learning_rate": 0.0009594273688046678, + "loss": 0.8356899, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.22351074, + "step": 807, + "time_per_iteration": 2.7444403171539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135063, + "balance_loss_mlp": 1.11292577, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.12637371348528909, + "language_loss": 0.85436296, + "learning_rate": 0.000959304346293171, + "loss": 0.8657136, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.22155762, + "step": 808, + "time_per_iteration": 2.630800485610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138798, + "balance_loss_mlp": 1.11732841, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.1222248699411619, + "language_loss": 0.87775064, + "learning_rate": 0.0009591811454639125, + "loss": 0.8891387, + 
"num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.21484375, + "step": 809, + "time_per_iteration": 2.7841880321502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140586, + "balance_loss_mlp": 1.11836529, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.0775612296994351, + "language_loss": 0.87793982, + "learning_rate": 0.0009590577663647234, + "loss": 0.88934565, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.22216797, + "step": 810, + "time_per_iteration": 2.7182021141052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171751, + "balance_loss_mlp": 1.14905357, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.0958777530802899, + "language_loss": 0.85768712, + "learning_rate": 0.0009589342090435036, + "loss": 0.86940467, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.22692871, + "step": 811, + "time_per_iteration": 2.794064521789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186257, + "balance_loss_mlp": 1.16242695, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07937656910484732, + "language_loss": 0.86963636, + "learning_rate": 0.0009588104735482223, + "loss": 0.88149893, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.23803711, + "step": 812, + "time_per_iteration": 2.7221293449401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208738, + "balance_loss_mlp": 1.18419302, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.1117655096069856, + "language_loss": 0.83743179, + "learning_rate": 0.0009586865599269177, + "loss": 0.84951913, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.24536133, + "step": 813, + "time_per_iteration": 2.690633773803711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238315, + "balance_loss_mlp": 1.21402001, + "epoch": 0.1565986918045402, + "flos": 
637478843904.0, + "grad_norm": 0.10590050341373854, + "language_loss": 0.8774755, + "learning_rate": 0.0009585624682276977, + "loss": 0.88985866, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.24291992, + "step": 814, + "time_per_iteration": 2.756228446960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269022, + "balance_loss_mlp": 1.24407113, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.10996897761132594, + "language_loss": 0.87169892, + "learning_rate": 0.0009584381984987386, + "loss": 0.88438916, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.24938965, + "step": 815, + "time_per_iteration": 2.554874897003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264413, + "balance_loss_mlp": 1.23899746, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.08063052755401852, + "language_loss": 0.89821672, + "learning_rate": 0.0009583137507882864, + "loss": 0.91086084, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.25415039, + "step": 816, + "time_per_iteration": 2.667743444442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249115, + "balance_loss_mlp": 1.22435474, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.09885575067946582, + "language_loss": 0.80580056, + "learning_rate": 0.000958189125144656, + "loss": 0.81829166, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.24768066, + "step": 817, + "time_per_iteration": 2.727062463760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234827, + "balance_loss_mlp": 1.21099687, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.07125848643416562, + "language_loss": 0.88058704, + "learning_rate": 0.0009580643216162313, + "loss": 0.89293534, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.23803711, + "step": 818, + "time_per_iteration": 2.7225098609924316 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207, + "balance_loss_mlp": 1.18336058, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.1140894572848919, + "language_loss": 0.79018641, + "learning_rate": 0.0009579393402514652, + "loss": 0.80225646, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.23608398, + "step": 819, + "time_per_iteration": 2.623739004135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174289, + "balance_loss_mlp": 1.15172231, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.099553940880187, + "language_loss": 0.90219855, + "learning_rate": 0.0009578141810988801, + "loss": 0.9139415, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.22546387, + "step": 820, + "time_per_iteration": 2.6413519382476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115754, + "balance_loss_mlp": 1.13443768, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.07166699024259414, + "language_loss": 0.90092921, + "learning_rate": 0.0009576888442070668, + "loss": 0.91250455, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.23095703, + "step": 821, + "time_per_iteration": 2.586008310317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114947, + "balance_loss_mlp": 1.12679601, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.12314887338256089, + "language_loss": 0.91971326, + "learning_rate": 0.0009575633296246854, + "loss": 0.93120795, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.22668457, + "step": 822, + "time_per_iteration": 2.582914113998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153752, + "balance_loss_mlp": 1.13104272, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.08930039023036396, + "language_loss": 0.83068377, + "learning_rate": 0.0009574376374004652, + "loss": 0.84222132, + 
"num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.22692871, + "step": 823, + "time_per_iteration": 2.689706563949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174252, + "balance_loss_mlp": 1.15108991, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.08166713358237257, + "language_loss": 0.80265462, + "learning_rate": 0.000957311767583204, + "loss": 0.81439716, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.23156738, + "step": 824, + "time_per_iteration": 2.5872888565063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134272, + "balance_loss_mlp": 1.11863208, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.027722115426624477, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83205861, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.15625, + "step": 825, + "time_per_iteration": 4.749661445617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186754, + "balance_loss_mlp": 1.16332912, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.0939924469385621, + "language_loss": 0.91145539, + "learning_rate": 0.0009570594953650961, + "loss": 0.92332292, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.23425293, + "step": 826, + "time_per_iteration": 2.5129754543304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211327, + "balance_loss_mlp": 1.1879499, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.08032409834180723, + "language_loss": 0.80093443, + "learning_rate": 0.00095693309306219, + "loss": 0.81304777, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.23364258, + "step": 827, + "time_per_iteration": 3.116727352142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203567, + "balance_loss_mlp": 1.17957044, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + 
"grad_norm": 0.07716345894173686, + "language_loss": 0.87652111, + "learning_rate": 0.0009568065133621244, + "loss": 0.88855684, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.23986816, + "step": 828, + "time_per_iteration": 3.3514394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186554, + "balance_loss_mlp": 1.1635462, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.09010150887645839, + "language_loss": 0.84615266, + "learning_rate": 0.0009566797563140422, + "loss": 0.85801816, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.23022461, + "step": 829, + "time_per_iteration": 2.8772377967834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178682, + "balance_loss_mlp": 1.15541196, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.07629618570457763, + "language_loss": 0.87662935, + "learning_rate": 0.0009565528219671547, + "loss": 0.88841611, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.23266602, + "step": 830, + "time_per_iteration": 2.9242594242095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168807, + "balance_loss_mlp": 1.14639533, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.07916714158721186, + "language_loss": 0.84442008, + "learning_rate": 0.0009564257103707418, + "loss": 0.85610813, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.22424316, + "step": 831, + "time_per_iteration": 2.615751266479492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115633, + "balance_loss_mlp": 1.13395441, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.07401424691307211, + "language_loss": 0.9042899, + "learning_rate": 0.0009562984215741533, + "loss": 0.91585314, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.22387695, + "step": 832, + "time_per_iteration": 2.666475296020508 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01143834, + "balance_loss_mlp": 1.12204242, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.07498028486943187, + "language_loss": 0.82129556, + "learning_rate": 0.0009561709556268065, + "loss": 0.83273387, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.21801758, + "step": 833, + "time_per_iteration": 2.757997512817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139242, + "balance_loss_mlp": 1.11768937, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.09759486121205484, + "language_loss": 0.94624776, + "learning_rate": 0.0009560433125781884, + "loss": 0.95764017, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.21569824, + "step": 834, + "time_per_iteration": 2.7897424697875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141895, + "balance_loss_mlp": 1.12007987, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.11927807309460302, + "language_loss": 0.92270857, + "learning_rate": 0.0009559154924778544, + "loss": 0.93412757, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.21838379, + "step": 835, + "time_per_iteration": 2.7300117015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146944, + "balance_loss_mlp": 1.12510526, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.08296253434867956, + "language_loss": 0.85007012, + "learning_rate": 0.0009557874953754284, + "loss": 0.8615396, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.21862793, + "step": 836, + "time_per_iteration": 3.0692667961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171621, + "balance_loss_mlp": 1.15024722, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08629072980134203, + "language_loss": 0.83071995, + "learning_rate": 0.0009556593213206038, + "loss": 0.84243613, + "num_input_tokens_seen": 
69089360, + "router_z_loss_mlp": 0.21374512, + "step": 837, + "time_per_iteration": 2.762371778488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198448, + "balance_loss_mlp": 1.17696667, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.07520594985182873, + "language_loss": 0.8681106, + "learning_rate": 0.0009555309703631414, + "loss": 0.88009512, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.21484375, + "step": 838, + "time_per_iteration": 2.721184253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216338, + "balance_loss_mlp": 1.19352138, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.08529357587841585, + "language_loss": 0.87116075, + "learning_rate": 0.0009554024425528722, + "loss": 0.88332415, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.22802734, + "step": 839, + "time_per_iteration": 2.7104406356811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223488, + "balance_loss_mlp": 1.20211315, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.09500040264705899, + "language_loss": 0.88661861, + "learning_rate": 0.0009552737379396948, + "loss": 0.89885342, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.21386719, + "step": 840, + "time_per_iteration": 2.6247448921203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214716, + "balance_loss_mlp": 1.19292414, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.06615948862952871, + "language_loss": 0.87843263, + "learning_rate": 0.0009551448565735767, + "loss": 0.8905797, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.21826172, + "step": 841, + "time_per_iteration": 2.8262698650360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211691, + "balance_loss_mlp": 1.19057953, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 
0.09887794790206932, + "language_loss": 0.8426103, + "learning_rate": 0.0009550157985045543, + "loss": 0.85472721, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.21130371, + "step": 842, + "time_per_iteration": 3.0120604038238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206189, + "balance_loss_mlp": 1.18486238, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.08797554821911514, + "language_loss": 0.88739967, + "learning_rate": 0.0009548865637827321, + "loss": 0.89946151, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.21337891, + "step": 843, + "time_per_iteration": 2.6481337547302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204978, + "balance_loss_mlp": 1.18372297, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.09077498619786414, + "language_loss": 0.89573538, + "learning_rate": 0.0009547571524582838, + "loss": 0.90778512, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.21264648, + "step": 844, + "time_per_iteration": 2.5942928791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183078, + "balance_loss_mlp": 1.16156065, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.0818153207807116, + "language_loss": 0.92094475, + "learning_rate": 0.0009546275645814512, + "loss": 0.93277556, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.21533203, + "step": 845, + "time_per_iteration": 2.6533596515655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183489, + "balance_loss_mlp": 1.16250849, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.09434310518898727, + "language_loss": 0.89099437, + "learning_rate": 0.0009544978002025446, + "loss": 0.90282923, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.20983887, + "step": 846, + "time_per_iteration": 2.595737934112549 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01174812, + "balance_loss_mlp": 1.15389085, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.0786790126962769, + "language_loss": 0.86643338, + "learning_rate": 0.0009543678593719434, + "loss": 0.87818146, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.20922852, + "step": 847, + "time_per_iteration": 2.734328508377075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172317, + "balance_loss_mlp": 1.1513598, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.07855126038492752, + "language_loss": 0.87300336, + "learning_rate": 0.0009542377421400945, + "loss": 0.88472658, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.2097168, + "step": 848, + "time_per_iteration": 2.8172829151153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168862, + "balance_loss_mlp": 1.14789319, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.06818105137358721, + "language_loss": 0.83380383, + "learning_rate": 0.0009541074485575145, + "loss": 0.84549248, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.20983887, + "step": 849, + "time_per_iteration": 2.7554948329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153029, + "balance_loss_mlp": 1.13229823, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.07075228162905194, + "language_loss": 0.91935623, + "learning_rate": 0.0009539769786747874, + "loss": 0.93088651, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.20739746, + "step": 850, + "time_per_iteration": 2.681631326675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150714, + "balance_loss_mlp": 1.13010252, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.07677284982742894, + "language_loss": 0.80944598, + "learning_rate": 0.0009538463325425665, + "loss": 0.82095313, + "num_input_tokens_seen": 
70086560, + "router_z_loss_mlp": 0.20617676, + "step": 851, + "time_per_iteration": 2.735233783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154047, + "balance_loss_mlp": 1.13384068, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.11739032058616317, + "language_loss": 0.85686159, + "learning_rate": 0.0009537155102115728, + "loss": 0.86840206, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.20202637, + "step": 852, + "time_per_iteration": 2.620140790939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130528, + "balance_loss_mlp": 1.11065602, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.10634949324965158, + "language_loss": 0.83208728, + "learning_rate": 0.0009535845117325961, + "loss": 0.84339261, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.1986084, + "step": 853, + "time_per_iteration": 2.664644241333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137711, + "balance_loss_mlp": 1.11726654, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.07583670741084705, + "language_loss": 0.9317174, + "learning_rate": 0.0009534533371564946, + "loss": 0.94309455, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.20446777, + "step": 854, + "time_per_iteration": 2.801784038543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132055, + "balance_loss_mlp": 1.11093068, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.10901038327062007, + "language_loss": 0.88220453, + "learning_rate": 0.0009533219865341949, + "loss": 0.89352506, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.21130371, + "step": 855, + "time_per_iteration": 2.5974481105804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145642, + "balance_loss_mlp": 1.12525666, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 
0.08694797679629615, + "language_loss": 0.86617303, + "learning_rate": 0.0009531904599166916, + "loss": 0.87762946, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.20385742, + "step": 856, + "time_per_iteration": 2.6515426635742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165234, + "balance_loss_mlp": 1.1438601, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.10972732987763288, + "language_loss": 0.84639692, + "learning_rate": 0.0009530587573550478, + "loss": 0.85804921, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.21374512, + "step": 857, + "time_per_iteration": 2.5966737270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141097, + "balance_loss_mlp": 1.1243124, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.04856663639913232, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75460482, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.16796875, + "step": 858, + "time_per_iteration": 5.004236698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122785, + "balance_loss_mlp": 1.20751262, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.09065075677374754, + "language_loss": 0.89923048, + "learning_rate": 0.0009527948246039337, + "loss": 0.91150904, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.20336914, + "step": 859, + "time_per_iteration": 2.5762951374053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250043, + "balance_loss_mlp": 1.22891951, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.10611361403402562, + "language_loss": 0.87094891, + "learning_rate": 0.000952662594516931, + "loss": 0.88344932, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.21130371, + "step": 860, + "time_per_iteration": 3.1250970363616943 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01211235, + "balance_loss_mlp": 1.19042134, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.07567437441181586, + "language_loss": 0.86383927, + "learning_rate": 0.0009525301886907234, + "loss": 0.87595159, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.20812988, + "step": 861, + "time_per_iteration": 2.8821423053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119851, + "balance_loss_mlp": 1.17725468, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.09117738037536942, + "language_loss": 0.87712085, + "learning_rate": 0.0009523976071767155, + "loss": 0.88910592, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.21252441, + "step": 862, + "time_per_iteration": 2.7509195804595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164214, + "balance_loss_mlp": 1.14342415, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.08626936460480303, + "language_loss": 0.87840152, + "learning_rate": 0.00095226485002638, + "loss": 0.89004362, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.20800781, + "step": 863, + "time_per_iteration": 2.835188150405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148836, + "balance_loss_mlp": 1.12823641, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.09501605355258884, + "language_loss": 0.88929522, + "learning_rate": 0.0009521319172912576, + "loss": 0.90078366, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.20605469, + "step": 864, + "time_per_iteration": 2.773681879043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148606, + "balance_loss_mlp": 1.12822115, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.1262431233900787, + "language_loss": 0.94519138, + "learning_rate": 0.0009519988090229579, + "loss": 0.95667744, + "num_input_tokens_seen": 71261728, 
+ "router_z_loss_mlp": 0.20385742, + "step": 865, + "time_per_iteration": 2.7055397033691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134112, + "balance_loss_mlp": 1.11327457, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.10486068908473449, + "language_loss": 0.87655658, + "learning_rate": 0.0009518655252731576, + "loss": 0.88789773, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.20849609, + "step": 866, + "time_per_iteration": 2.774974822998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124242, + "balance_loss_mlp": 1.102844, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.09006182482731041, + "language_loss": 0.90070617, + "learning_rate": 0.0009517320660936022, + "loss": 0.91194862, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.2142334, + "step": 867, + "time_per_iteration": 2.7388041019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126447, + "balance_loss_mlp": 1.1068728, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.09548967470897408, + "language_loss": 0.82877147, + "learning_rate": 0.0009515984315361051, + "loss": 0.84003592, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.19555664, + "step": 868, + "time_per_iteration": 2.822772264480591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113243, + "balance_loss_mlp": 1.11205709, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.10934486098426227, + "language_loss": 0.86598766, + "learning_rate": 0.000951464621652548, + "loss": 0.87731194, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.20373535, + "step": 869, + "time_per_iteration": 2.648505687713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159804, + "balance_loss_mlp": 1.13964605, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.11951376597850719, + 
"language_loss": 0.7861675, + "learning_rate": 0.0009513306364948804, + "loss": 0.79776561, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.20153809, + "step": 870, + "time_per_iteration": 2.781686305999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188864, + "balance_loss_mlp": 1.16833639, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09165243347067362, + "language_loss": 0.88987041, + "learning_rate": 0.0009511964761151197, + "loss": 0.90175903, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.2052002, + "step": 871, + "time_per_iteration": 2.5691447257995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122616, + "balance_loss_mlp": 1.20546532, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.09901550717192838, + "language_loss": 0.90224719, + "learning_rate": 0.0009510621405653521, + "loss": 0.91450876, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.20690918, + "step": 872, + "time_per_iteration": 2.585707426071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191078, + "balance_loss_mlp": 1.17049098, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.11167023861469132, + "language_loss": 0.83886391, + "learning_rate": 0.0009509276298977309, + "loss": 0.85077471, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.20581055, + "step": 873, + "time_per_iteration": 2.970672607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177127, + "balance_loss_mlp": 1.15688562, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.09073459995989616, + "language_loss": 0.81845176, + "learning_rate": 0.0009507929441644778, + "loss": 0.83022296, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.20239258, + "step": 874, + "time_per_iteration": 3.5511813163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01137436, + "balance_loss_mlp": 1.11694419, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.09068306382456774, + "language_loss": 0.85649496, + "learning_rate": 0.0009506580834178826, + "loss": 0.86786938, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.20495605, + "step": 875, + "time_per_iteration": 2.797485589981079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130428, + "balance_loss_mlp": 1.10986471, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.09154973704283995, + "language_loss": 0.91347295, + "learning_rate": 0.0009505230477103028, + "loss": 0.92477721, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.20568848, + "step": 876, + "time_per_iteration": 2.70495867729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145788, + "balance_loss_mlp": 1.12518883, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.10157591470828177, + "language_loss": 0.8152402, + "learning_rate": 0.0009503878370941641, + "loss": 0.82669806, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.20593262, + "step": 877, + "time_per_iteration": 2.735748052597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151178, + "balance_loss_mlp": 1.13084054, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.15099055549540594, + "language_loss": 0.88741207, + "learning_rate": 0.0009502524516219595, + "loss": 0.89892387, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.20336914, + "step": 878, + "time_per_iteration": 2.730163812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160201, + "balance_loss_mlp": 1.13942301, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.11693714010182361, + "language_loss": 0.9004457, + "learning_rate": 0.0009501168913462506, + "loss": 0.91204774, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 
0.20788574, + "step": 879, + "time_per_iteration": 2.684805393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088136, + "balance_loss_mlp": 1.07440281, + "epoch": 0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.04309817230007909, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80210066, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.13769531, + "step": 880, + "time_per_iteration": 4.804383039474487 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166032, + "balance_loss_mlp": 1.14521825, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.08467938058221719, + "language_loss": 0.85053843, + "learning_rate": 0.0009498452465949042, + "loss": 0.86219883, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.20825195, + "step": 881, + "time_per_iteration": 3.276735305786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201291, + "balance_loss_mlp": 1.17981005, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.06992657838118156, + "language_loss": 0.91281927, + "learning_rate": 0.0009497091622247285, + "loss": 0.92483222, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.21484375, + "step": 882, + "time_per_iteration": 2.70647931098938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200556, + "balance_loss_mlp": 1.17901504, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.0696336676613267, + "language_loss": 0.93377209, + "learning_rate": 0.0009495729032619723, + "loss": 0.94577771, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.2154541, + "step": 883, + "time_per_iteration": 2.7534360885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227373, + "balance_loss_mlp": 1.20546222, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.08705372199297186, + "language_loss": 
0.83726418, + "learning_rate": 0.0009494364697595354, + "loss": 0.84953797, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.21923828, + "step": 884, + "time_per_iteration": 2.9550111293792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242229, + "balance_loss_mlp": 1.22078347, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.08532836159387652, + "language_loss": 0.89805126, + "learning_rate": 0.0009492998617703867, + "loss": 0.91047359, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.21472168, + "step": 885, + "time_per_iteration": 2.710296154022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216994, + "balance_loss_mlp": 1.19604921, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.08218136336540412, + "language_loss": 0.87561512, + "learning_rate": 0.0009491630793475619, + "loss": 0.88778508, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.20959473, + "step": 886, + "time_per_iteration": 2.6574454307556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223448, + "balance_loss_mlp": 1.20190716, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.06673733954054763, + "language_loss": 0.85054195, + "learning_rate": 0.0009490261225441643, + "loss": 0.8627764, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.2154541, + "step": 887, + "time_per_iteration": 2.9003562927246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209094, + "balance_loss_mlp": 1.18812537, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07511336927499353, + "language_loss": 0.89910543, + "learning_rate": 0.0009488889914133656, + "loss": 0.91119635, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.2097168, + "step": 888, + "time_per_iteration": 2.9909205436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121642, + 
"balance_loss_mlp": 1.19492674, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.07825003291748035, + "language_loss": 0.88796103, + "learning_rate": 0.0009487516860084047, + "loss": 0.90012527, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.21496582, + "step": 889, + "time_per_iteration": 2.7500009536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192765, + "balance_loss_mlp": 1.17159319, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.10600638107264272, + "language_loss": 0.88708925, + "learning_rate": 0.0009486142063825884, + "loss": 0.89901692, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.21179199, + "step": 890, + "time_per_iteration": 2.583644390106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212506, + "balance_loss_mlp": 1.19724751, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.09034147523214399, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73638725, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.15234375, + "step": 891, + "time_per_iteration": 4.9979774951934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175422, + "balance_loss_mlp": 1.1550256, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.1258166683219009, + "language_loss": 0.89561093, + "learning_rate": 0.0009483387246819542, + "loss": 0.9073652, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.20397949, + "step": 892, + "time_per_iteration": 2.7332327365875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068395, + "balance_loss_mlp": 1.05304134, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.03219618488122811, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83353972, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 
0.15332031, + "step": 893, + "time_per_iteration": 4.691076993942261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142338, + "balance_loss_mlp": 1.12172627, + "epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.0974140714584663, + "language_loss": 0.88822401, + "learning_rate": 0.0009480625467392688, + "loss": 0.89964741, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.20617676, + "step": 894, + "time_per_iteration": 2.646313190460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_mlp": 1.02080703, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.032237767215918686, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79031026, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.15527344, + "step": 895, + "time_per_iteration": 4.73791241645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134198, + "balance_loss_mlp": 1.11333644, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.07818472841262332, + "language_loss": 0.8733896, + "learning_rate": 0.0009477856729834196, + "loss": 0.88473153, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.20874023, + "step": 896, + "time_per_iteration": 2.7401630878448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132796, + "balance_loss_mlp": 1.11235166, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.07866740874279901, + "language_loss": 0.89730608, + "learning_rate": 0.0009476469753098809, + "loss": 0.90863407, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.20446777, + "step": 897, + "time_per_iteration": 2.7601003646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141178, + "balance_loss_mlp": 1.12072182, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08200394390051394, + "language_loss": 
0.86714321, + "learning_rate": 0.0009475081038443738, + "loss": 0.878555, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.20458984, + "step": 898, + "time_per_iteration": 2.621018171310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137375, + "balance_loss_mlp": 1.11602414, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.07995623076613839, + "language_loss": 0.85080326, + "learning_rate": 0.0009473690586408124, + "loss": 0.86217701, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.21374512, + "step": 899, + "time_per_iteration": 2.8553502559661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149397, + "balance_loss_mlp": 1.12811828, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.08690536389731517, + "language_loss": 0.85954648, + "learning_rate": 0.0009472298397531792, + "loss": 0.87104046, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.21276855, + "step": 900, + "time_per_iteration": 2.7427260875701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141965, + "balance_loss_mlp": 1.12017393, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.12119389218583115, + "language_loss": 0.86411273, + "learning_rate": 0.0009470904472355235, + "loss": 0.87553239, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.21801758, + "step": 901, + "time_per_iteration": 2.6585657596588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138192, + "balance_loss_mlp": 1.11563718, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.08947887393013387, + "language_loss": 0.79425454, + "learning_rate": 0.0009469508811419626, + "loss": 0.80563653, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.22570801, + "step": 902, + "time_per_iteration": 2.725372791290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207563, + 
"balance_loss_mlp": 1.1882031, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.06736803575768126, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72821391, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.19335938, + "step": 903, + "time_per_iteration": 4.819333553314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138702, + "balance_loss_mlp": 1.1156832, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.10583475939723401, + "language_loss": 0.83563209, + "learning_rate": 0.0009466712284439292, + "loss": 0.84701914, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.23022461, + "step": 904, + "time_per_iteration": 2.7723944187164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136442, + "balance_loss_mlp": 1.11426902, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.09911822478323383, + "language_loss": 0.88385195, + "learning_rate": 0.0009465311419480276, + "loss": 0.89521635, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.22180176, + "step": 905, + "time_per_iteration": 2.708866596221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161825, + "balance_loss_mlp": 1.14012873, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.07480170707629828, + "language_loss": 0.88125765, + "learning_rate": 0.0009463908820933622, + "loss": 0.89287591, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.21704102, + "step": 906, + "time_per_iteration": 2.8386967182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165754, + "balance_loss_mlp": 1.1450001, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.09057770875445449, + "language_loss": 0.82559198, + "learning_rate": 0.0009462504489343868, + "loss": 0.83724952, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.20751953, 
+ "step": 907, + "time_per_iteration": 2.8287012577056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182859, + "balance_loss_mlp": 1.16167533, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.0967031701007891, + "language_loss": 0.88244259, + "learning_rate": 0.0009461098425256222, + "loss": 0.89427125, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.21203613, + "step": 908, + "time_per_iteration": 2.636411190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184624, + "balance_loss_mlp": 1.16438186, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.08569423221876828, + "language_loss": 0.85917675, + "learning_rate": 0.0009459690629216567, + "loss": 0.87102294, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.20239258, + "step": 909, + "time_per_iteration": 2.6774063110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185319, + "balance_loss_mlp": 1.16585207, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.06867787211129477, + "language_loss": 0.87373209, + "learning_rate": 0.0009458281101771457, + "loss": 0.88558531, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.19445801, + "step": 910, + "time_per_iteration": 2.6256136894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183744, + "balance_loss_mlp": 1.16421723, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.07423161751862324, + "language_loss": 0.82895565, + "learning_rate": 0.0009456869843468122, + "loss": 0.84079307, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.19519043, + "step": 911, + "time_per_iteration": 2.8429157733917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181199, + "balance_loss_mlp": 1.16098118, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.10560425483963332, + "language_loss": 0.78068089, 
+ "learning_rate": 0.0009455456854854459, + "loss": 0.79249287, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.20214844, + "step": 912, + "time_per_iteration": 2.6220157146453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161811, + "balance_loss_mlp": 1.1425947, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.07427056945907796, + "language_loss": 0.84015787, + "learning_rate": 0.0009454042136479039, + "loss": 0.851776, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.19189453, + "step": 913, + "time_per_iteration": 2.5928330421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170659, + "balance_loss_mlp": 1.15183616, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.08169247609196438, + "language_loss": 0.82556438, + "learning_rate": 0.0009452625688891103, + "loss": 0.83727098, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.18798828, + "step": 914, + "time_per_iteration": 2.5541818141937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215011, + "balance_loss_mlp": 1.20032406, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.06355474766062214, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79949749, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.14648438, + "step": 915, + "time_per_iteration": 4.609099864959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151019, + "balance_loss_mlp": 1.13170671, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.08978748093655645, + "language_loss": 0.92478371, + "learning_rate": 0.0009449787608278015, + "loss": 0.9362939, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.19299316, + "step": 916, + "time_per_iteration": 2.8081016540527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144026, + "balance_loss_mlp": 
1.12480903, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08892608358050076, + "language_loss": 0.9215048, + "learning_rate": 0.0009448365976354704, + "loss": 0.93294501, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.19213867, + "step": 917, + "time_per_iteration": 2.5476417541503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141081, + "balance_loss_mlp": 1.12047005, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.10930526403118525, + "language_loss": 0.89404565, + "learning_rate": 0.0009446942617422558, + "loss": 0.90545642, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.20617676, + "step": 918, + "time_per_iteration": 2.6054670810699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159521, + "balance_loss_mlp": 1.13917232, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.08039502929266268, + "language_loss": 0.84809625, + "learning_rate": 0.0009445517532034176, + "loss": 0.85969138, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.20349121, + "step": 919, + "time_per_iteration": 2.736720561981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116126, + "balance_loss_mlp": 1.14050603, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.09960932315337, + "language_loss": 0.88503635, + "learning_rate": 0.0009444090720742824, + "loss": 0.89664894, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.20751953, + "step": 920, + "time_per_iteration": 2.5981345176696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118497, + "balance_loss_mlp": 1.16263032, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.09080242050592086, + "language_loss": 0.87781966, + "learning_rate": 0.0009442662184102439, + "loss": 0.88966942, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.22351074, + "step": 921, + 
"time_per_iteration": 2.855386972427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195366, + "balance_loss_mlp": 1.17316878, + "epoch": 0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.07657240030806824, + "language_loss": 0.86990869, + "learning_rate": 0.000944123192266763, + "loss": 0.88186234, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.22216797, + "step": 922, + "time_per_iteration": 2.862642526626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184819, + "balance_loss_mlp": 1.16284895, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.09417779830391854, + "language_loss": 0.83500814, + "learning_rate": 0.0009439799936993671, + "loss": 0.8468563, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.21960449, + "step": 923, + "time_per_iteration": 2.7609872817993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172194, + "balance_loss_mlp": 1.1505692, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.183012338078167, + "language_loss": 0.87992036, + "learning_rate": 0.0009438366227636511, + "loss": 0.89164221, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.21630859, + "step": 924, + "time_per_iteration": 2.680379867553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147406, + "balance_loss_mlp": 1.12692571, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.07052119854018758, + "language_loss": 0.8590064, + "learning_rate": 0.0009436930795152763, + "loss": 0.87048048, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.20483398, + "step": 925, + "time_per_iteration": 2.84305477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134796, + "balance_loss_mlp": 1.11461377, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.10542322310235813, + "language_loss": 0.86425805, + "learning_rate": 
0.0009435493640099713, + "loss": 0.875606, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.20178223, + "step": 926, + "time_per_iteration": 2.8326363563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147089, + "balance_loss_mlp": 1.12663293, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.1030160256649362, + "language_loss": 0.83799899, + "learning_rate": 0.0009434054763035314, + "loss": 0.8494699, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.20458984, + "step": 927, + "time_per_iteration": 2.6224582195281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142086, + "balance_loss_mlp": 1.12232113, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.0964966031181637, + "language_loss": 0.85150439, + "learning_rate": 0.0009432614164518185, + "loss": 0.86292523, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.19750977, + "step": 928, + "time_per_iteration": 2.989607810974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115354, + "balance_loss_mlp": 1.13345337, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.11261525147662245, + "language_loss": 0.84222531, + "learning_rate": 0.000943117184510762, + "loss": 0.85376072, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.20080566, + "step": 929, + "time_per_iteration": 3.0107991695404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167369, + "balance_loss_mlp": 1.15220594, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.0706795740425107, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.7995733, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.15136719, + "step": 930, + "time_per_iteration": 5.0069990158081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168853, + "balance_loss_mlp": 1.14890909, + "epoch": 
0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.0722944763131068, + "language_loss": 0.885297, + "learning_rate": 0.0009428282045846674, + "loss": 0.89698553, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.19934082, + "step": 931, + "time_per_iteration": 2.705216884613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173254, + "balance_loss_mlp": 1.15314293, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.06808586729698768, + "language_loss": 0.89063865, + "learning_rate": 0.0009426834567118214, + "loss": 0.90237117, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.2010498, + "step": 932, + "time_per_iteration": 3.1137044429779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179521, + "balance_loss_mlp": 1.16003084, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.07690698304216284, + "language_loss": 0.80337363, + "learning_rate": 0.0009425385369740155, + "loss": 0.81516886, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.19470215, + "step": 933, + "time_per_iteration": 3.0430078506469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186692, + "balance_loss_mlp": 1.16659284, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.10248881334159239, + "language_loss": 0.86684513, + "learning_rate": 0.0009423934454275125, + "loss": 0.87871206, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.20092773, + "step": 934, + "time_per_iteration": 2.888127565383911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171583, + "balance_loss_mlp": 1.15185428, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.08181978587800019, + "language_loss": 0.91464841, + "learning_rate": 0.0009422481821286418, + "loss": 0.92636418, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.19714355, + "step": 935, + "time_per_iteration": 
2.725064516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.13605165, + "epoch": 0.18006925740669488, + "flos": 538077676032.0, + "grad_norm": 0.08977099192722084, + "language_loss": 0.87336344, + "learning_rate": 0.0009421027471337998, + "loss": 0.88491625, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.19213867, + "step": 936, + "time_per_iteration": 2.64992356300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153899, + "balance_loss_mlp": 1.13451552, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.08166389785278784, + "language_loss": 0.82045889, + "learning_rate": 0.0009419571404994493, + "loss": 0.83199793, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.19360352, + "step": 937, + "time_per_iteration": 2.6302027702331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140367, + "balance_loss_mlp": 1.12045932, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.10573813889003272, + "language_loss": 0.9057107, + "learning_rate": 0.00094181136228212, + "loss": 0.91711438, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.19909668, + "step": 938, + "time_per_iteration": 2.6472811698913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146966, + "balance_loss_mlp": 1.12671292, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.10223057205117164, + "language_loss": 0.85864574, + "learning_rate": 0.0009416654125384077, + "loss": 0.8701154, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.20251465, + "step": 939, + "time_per_iteration": 2.7523345947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100615, + "balance_loss_mlp": 1.08507037, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.03692949506691956, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + 
"loss": 0.80872989, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.15527344, + "step": 940, + "time_per_iteration": 4.95509147644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139639, + "balance_loss_mlp": 1.1185863, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.07658245982623446, + "language_loss": 0.83579218, + "learning_rate": 0.000941372998698552, + "loss": 0.84718859, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.21057129, + "step": 941, + "time_per_iteration": 3.022993326187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152659, + "balance_loss_mlp": 1.13134432, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.08701506300356623, + "language_loss": 0.81278259, + "learning_rate": 0.0009412265347159336, + "loss": 0.82430923, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.21325684, + "step": 942, + "time_per_iteration": 2.7516462802886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.11446774, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.09990043941217396, + "language_loss": 0.84286022, + "learning_rate": 0.0009410798994339829, + "loss": 0.85422134, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.21655273, + "step": 943, + "time_per_iteration": 2.619678258895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125702, + "balance_loss_mlp": 1.10438752, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.0907244307429491, + "language_loss": 0.87645197, + "learning_rate": 0.000940933092909628, + "loss": 0.88770896, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.21337891, + "step": 944, + "time_per_iteration": 2.5915796756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137514, + "balance_loss_mlp": 1.11566281, + "epoch": 0.18180069257406695, + 
"flos": 492389715456.0, + "grad_norm": 0.07468252045243974, + "language_loss": 0.8361553, + "learning_rate": 0.0009407861151998649, + "loss": 0.84753042, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.21838379, + "step": 945, + "time_per_iteration": 2.597646713256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146698, + "balance_loss_mlp": 1.12490702, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.07893028842648955, + "language_loss": 0.85781825, + "learning_rate": 0.0009406389663617552, + "loss": 0.86928523, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.21789551, + "step": 946, + "time_per_iteration": 2.6909499168395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157426, + "balance_loss_mlp": 1.1367197, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.0883302731715351, + "language_loss": 0.85250366, + "learning_rate": 0.000940491646452427, + "loss": 0.86407793, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.20703125, + "step": 947, + "time_per_iteration": 2.7548892498016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188123, + "balance_loss_mlp": 1.16742826, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.09521178511918296, + "language_loss": 0.9039495, + "learning_rate": 0.000940344155529075, + "loss": 0.91583067, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.20690918, + "step": 948, + "time_per_iteration": 2.6882100105285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214568, + "balance_loss_mlp": 1.19396889, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.12174210826928723, + "language_loss": 0.86923814, + "learning_rate": 0.0009401964936489605, + "loss": 0.88138384, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.20605469, + "step": 949, + "time_per_iteration": 2.5339841842651367 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199663, + "balance_loss_mlp": 1.18013692, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, + "grad_norm": 0.0789508013524053, + "language_loss": 0.85218668, + "learning_rate": 0.0009400486608694108, + "loss": 0.86418331, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.19506836, + "step": 950, + "time_per_iteration": 2.7437641620635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173826, + "balance_loss_mlp": 1.15394247, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.08777486633127113, + "language_loss": 0.87155032, + "learning_rate": 0.0009399006572478195, + "loss": 0.88328856, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.19873047, + "step": 951, + "time_per_iteration": 3.1146392822265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151812, + "balance_loss_mlp": 1.1324048, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.06965363368279433, + "language_loss": 0.90749818, + "learning_rate": 0.0009397524828416468, + "loss": 0.91901636, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.19384766, + "step": 952, + "time_per_iteration": 2.7005960941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150163, + "balance_loss_mlp": 1.13092208, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.08371144384200242, + "language_loss": 0.95721734, + "learning_rate": 0.0009396041377084192, + "loss": 0.96871901, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.19226074, + "step": 953, + "time_per_iteration": 2.65962290763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143055, + "balance_loss_mlp": 1.12399304, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.07808709569264205, + "language_loss": 0.87208664, + "learning_rate": 0.0009394556219057295, + "loss": 0.88351727, + 
"num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.19055176, + "step": 954, + "time_per_iteration": 2.7021074295043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146054, + "balance_loss_mlp": 1.12665915, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.0732836103686164, + "language_loss": 0.83296251, + "learning_rate": 0.0009393069354912362, + "loss": 0.84442306, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.19372559, + "step": 955, + "time_per_iteration": 2.7472946643829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146804, + "balance_loss_mlp": 1.12801623, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.07466806963668332, + "language_loss": 0.81601501, + "learning_rate": 0.0009391580785226649, + "loss": 0.827483, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.18798828, + "step": 956, + "time_per_iteration": 2.865922212600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.07007885, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.04640489893855834, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80424643, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.14160156, + "step": 957, + "time_per_iteration": 4.8100152015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115619, + "balance_loss_mlp": 1.13656831, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.08641924144795167, + "language_loss": 0.86033231, + "learning_rate": 0.0009388598531545196, + "loss": 0.87189424, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.19604492, + "step": 958, + "time_per_iteration": 2.879993438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162855, + "balance_loss_mlp": 1.14316201, + "epoch": 0.18449403616775684, + "flos": 
517933066752.0, + "grad_norm": 0.08295253694800603, + "language_loss": 0.85064113, + "learning_rate": 0.000938710484870727, + "loss": 0.8622697, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.19677734, + "step": 959, + "time_per_iteration": 2.6058270931243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169469, + "balance_loss_mlp": 1.14974046, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.0909196929102129, + "language_loss": 0.85416096, + "learning_rate": 0.0009385609462644189, + "loss": 0.86585563, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.19714355, + "step": 960, + "time_per_iteration": 4.22582483291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116162, + "balance_loss_mlp": 1.14138985, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.0839924836083711, + "language_loss": 0.8550421, + "learning_rate": 0.0009384112373936514, + "loss": 0.86665827, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.20227051, + "step": 961, + "time_per_iteration": 2.6566050052642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161461, + "balance_loss_mlp": 1.14142191, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.0943693164808434, + "language_loss": 0.90989888, + "learning_rate": 0.0009382613583165467, + "loss": 0.92151344, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.20031738, + "step": 962, + "time_per_iteration": 2.823707103729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115093, + "balance_loss_mlp": 1.13110566, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.07960710886198098, + "language_loss": 0.89083374, + "learning_rate": 0.0009381113090912928, + "loss": 0.90234309, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.19824219, + "step": 963, + "time_per_iteration": 2.760617733001709 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113196, + "balance_loss_mlp": 1.11194444, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 0.09269195293936518, + "language_loss": 0.89102614, + "learning_rate": 0.000937961089776144, + "loss": 0.90234572, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.20007324, + "step": 964, + "time_per_iteration": 2.637064218521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137413, + "balance_loss_mlp": 1.11674166, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.09284731320409442, + "language_loss": 0.82889503, + "learning_rate": 0.0009378107004294208, + "loss": 0.84026921, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.20678711, + "step": 965, + "time_per_iteration": 2.9863977432250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133143, + "balance_loss_mlp": 1.11312819, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.08496740626071231, + "language_loss": 0.90790451, + "learning_rate": 0.0009376601411095096, + "loss": 0.91923594, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.20007324, + "step": 966, + "time_per_iteration": 2.68448543548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118077, + "balance_loss_mlp": 1.09840786, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.07860547413279617, + "language_loss": 0.8636961, + "learning_rate": 0.0009375094118748622, + "loss": 0.87487686, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.1965332, + "step": 967, + "time_per_iteration": 2.6023223400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116788, + "balance_loss_mlp": 1.09746408, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.09121906518800267, + "language_loss": 0.90388292, + "learning_rate": 0.0009373585127839976, + "loss": 0.91505075, + 
"num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.19299316, + "step": 968, + "time_per_iteration": 2.9992241859436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128327, + "balance_loss_mlp": 1.10974205, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08500834593637788, + "language_loss": 0.90474886, + "learning_rate": 0.0009372074438954994, + "loss": 0.91603214, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.18579102, + "step": 969, + "time_per_iteration": 2.6900458335876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129744, + "balance_loss_mlp": 1.11119485, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.07463307704621708, + "language_loss": 0.91465181, + "learning_rate": 0.0009370562052680181, + "loss": 0.92594928, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.18554688, + "step": 970, + "time_per_iteration": 2.4830586910247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118559, + "balance_loss_mlp": 1.10014117, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.0879562727670826, + "language_loss": 0.89281493, + "learning_rate": 0.0009369047969602695, + "loss": 0.90400052, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.18432617, + "step": 971, + "time_per_iteration": 2.745058298110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126638, + "balance_loss_mlp": 1.10707593, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.10844584745321367, + "language_loss": 0.862324, + "learning_rate": 0.0009367532190310357, + "loss": 0.87359041, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.19543457, + "step": 972, + "time_per_iteration": 2.6137964725494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113488, + "balance_loss_mlp": 1.09404469, + "epoch": 0.1871873797614467, + "flos": 
553283136000.0, + "grad_norm": 0.07658656218276177, + "language_loss": 0.88875228, + "learning_rate": 0.0009366014715391644, + "loss": 0.8998872, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.19433594, + "step": 973, + "time_per_iteration": 2.6654906272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112059, + "balance_loss_mlp": 1.09299731, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.11180851981284076, + "language_loss": 0.83713347, + "learning_rate": 0.0009364495545435693, + "loss": 0.84825402, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.19055176, + "step": 974, + "time_per_iteration": 2.801388740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120051, + "balance_loss_mlp": 1.1004051, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.06978545014802194, + "language_loss": 0.87871438, + "learning_rate": 0.0009362974681032297, + "loss": 0.88991487, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.19628906, + "step": 975, + "time_per_iteration": 2.6227941513061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124993, + "balance_loss_mlp": 1.10491848, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.08030171004504767, + "language_loss": 0.88050348, + "learning_rate": 0.0009361452122771907, + "loss": 0.89175344, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.20080566, + "step": 976, + "time_per_iteration": 2.899641752243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139865, + "balance_loss_mlp": 1.1185981, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.09158450212133555, + "language_loss": 0.82837689, + "learning_rate": 0.0009359927871245635, + "loss": 0.8397755, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.21289062, + "step": 977, + "time_per_iteration": 2.5095362663269043 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147728, + "balance_loss_mlp": 1.12616336, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 0.08436158367459867, + "language_loss": 0.86086357, + "learning_rate": 0.0009358401927045246, + "loss": 0.8723408, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.21569824, + "step": 978, + "time_per_iteration": 2.880329132080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115214, + "balance_loss_mlp": 1.12937117, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.14896441210102726, + "language_loss": 0.881185, + "learning_rate": 0.0009356874290763166, + "loss": 0.89270639, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.22753906, + "step": 979, + "time_per_iteration": 3.519901990890503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146434, + "balance_loss_mlp": 1.12485671, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.08194638070334626, + "language_loss": 0.88670301, + "learning_rate": 0.0009355344962992474, + "loss": 0.89816737, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.21606445, + "step": 980, + "time_per_iteration": 2.638364553451538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137899, + "balance_loss_mlp": 1.11571455, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.07836652437453029, + "language_loss": 0.8762567, + "learning_rate": 0.0009353813944326908, + "loss": 0.88763571, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.22180176, + "step": 981, + "time_per_iteration": 2.963667869567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131587, + "balance_loss_mlp": 1.10924709, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.08486883897693408, + "language_loss": 0.82728517, + "learning_rate": 0.0009352281235360863, + "loss": 0.83860105, + 
"num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.22338867, + "step": 982, + "time_per_iteration": 2.752194404602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146713, + "balance_loss_mlp": 1.12631679, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.08390803894001939, + "language_loss": 0.84704804, + "learning_rate": 0.0009350746836689389, + "loss": 0.85851514, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.20385742, + "step": 983, + "time_per_iteration": 2.572817325592041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114981, + "balance_loss_mlp": 1.13550532, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.06256828552174507, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.8258903, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.14257812, + "step": 984, + "time_per_iteration": 5.0779805183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126626, + "balance_loss_mlp": 1.10678935, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08472556529064418, + "language_loss": 0.82448637, + "learning_rate": 0.0009347672972613634, + "loss": 0.83575261, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.19824219, + "step": 985, + "time_per_iteration": 2.615293502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113053, + "balance_loss_mlp": 1.11202836, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.06995806836739982, + "language_loss": 0.8510493, + "learning_rate": 0.0009346133508402735, + "loss": 0.86235464, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.18469238, + "step": 986, + "time_per_iteration": 2.729766845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145502, + "balance_loss_mlp": 1.12719178, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + 
"grad_norm": 0.07783152768123536, + "language_loss": 0.83385336, + "learning_rate": 0.0009344592356873166, + "loss": 0.84530836, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.18322754, + "step": 987, + "time_per_iteration": 2.642298698425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142552, + "balance_loss_mlp": 1.12420571, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.1311760581731783, + "language_loss": 0.78159761, + "learning_rate": 0.0009343049518623255, + "loss": 0.79302317, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.18359375, + "step": 988, + "time_per_iteration": 2.7496607303619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147502, + "balance_loss_mlp": 1.12969208, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.07011475213003748, + "language_loss": 0.82941067, + "learning_rate": 0.0009341504994251985, + "loss": 0.8408857, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.17822266, + "step": 989, + "time_per_iteration": 2.850295305252075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154172, + "balance_loss_mlp": 1.13986683, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.061552691423840886, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74674672, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.14257812, + "step": 990, + "time_per_iteration": 5.020269393920898 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160468, + "balance_loss_mlp": 1.14208579, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.07610354532645859, + "language_loss": 0.81556082, + "learning_rate": 0.0009338410889544574, + "loss": 0.82716548, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.18383789, + "step": 991, + "time_per_iteration": 3.0640664100646973 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159788, + "balance_loss_mlp": 1.14151347, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 0.07533691574431517, + "language_loss": 0.87469906, + "learning_rate": 0.000933686131040967, + "loss": 0.88629693, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.18273926, + "step": 992, + "time_per_iteration": 2.8369646072387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153048, + "balance_loss_mlp": 1.13516688, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.2292689794441624, + "language_loss": 0.90069616, + "learning_rate": 0.0009335310047555883, + "loss": 0.91222656, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.17895508, + "step": 993, + "time_per_iteration": 2.7662436962127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201823, + "balance_loss_mlp": 1.18303561, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.08969446374481721, + "language_loss": 0.87941462, + "learning_rate": 0.0009333757101585467, + "loss": 0.89143288, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.18786621, + "step": 994, + "time_per_iteration": 2.6766159534454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248094, + "balance_loss_mlp": 1.22967577, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.09684982281817384, + "language_loss": 0.93064606, + "learning_rate": 0.0009332202473101329, + "loss": 0.94312704, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.1842041, + "step": 995, + "time_per_iteration": 2.6848959922790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124761, + "balance_loss_mlp": 1.22866774, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.14945399887744149, + "language_loss": 0.82354605, + "learning_rate": 0.0009330646162707028, + "loss": 0.83602214, + 
"num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.18933105, + "step": 996, + "time_per_iteration": 2.7672605514526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120105, + "balance_loss_mlp": 1.18239403, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.09345568382872575, + "language_loss": 0.83716351, + "learning_rate": 0.0009329088171006779, + "loss": 0.84917402, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.18664551, + "step": 997, + "time_per_iteration": 3.177269697189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171291, + "balance_loss_mlp": 1.15201521, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.09261663839867938, + "language_loss": 0.85307527, + "learning_rate": 0.0009327528498605446, + "loss": 0.86478817, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.19274902, + "step": 998, + "time_per_iteration": 2.5818471908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136553, + "balance_loss_mlp": 1.11700296, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.11232924304021881, + "language_loss": 0.89184988, + "learning_rate": 0.0009325967146108548, + "loss": 0.90321541, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.1953125, + "step": 999, + "time_per_iteration": 2.672342300415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141697, + "balance_loss_mlp": 1.12257588, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.11996696196806446, + "language_loss": 0.87541509, + "learning_rate": 0.0009324404114122258, + "loss": 0.88683212, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.19104004, + "step": 1000, + "time_per_iteration": 2.7652101516723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142189, + "balance_loss_mlp": 1.12290096, + "epoch": 0.19257406694882648, + "flos": 
571982155776.0, + "grad_norm": 0.09563187877453348, + "language_loss": 0.86816871, + "learning_rate": 0.0009322839403253397, + "loss": 0.87959063, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.19274902, + "step": 1001, + "time_per_iteration": 2.7855865955352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113304, + "balance_loss_mlp": 1.11353719, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.0964526780140198, + "language_loss": 0.8374511, + "learning_rate": 0.0009321273014109439, + "loss": 0.84878153, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.19494629, + "step": 1002, + "time_per_iteration": 2.9773457050323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137462, + "balance_loss_mlp": 1.11835289, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.07256968924395192, + "language_loss": 0.8405087, + "learning_rate": 0.0009319704947298513, + "loss": 0.85188329, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.19104004, + "step": 1003, + "time_per_iteration": 2.8997581005096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144905, + "balance_loss_mlp": 1.12630868, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.15770574603346119, + "language_loss": 0.88051564, + "learning_rate": 0.0009318135203429393, + "loss": 0.89196467, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.18579102, + "step": 1004, + "time_per_iteration": 4.269490957260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156184, + "balance_loss_mlp": 1.13703942, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.08756431218137971, + "language_loss": 0.87512451, + "learning_rate": 0.0009316563783111511, + "loss": 0.88668633, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.19128418, + "step": 1005, + "time_per_iteration": 2.741323471069336 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164816, + "balance_loss_mlp": 1.14583826, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 0.06803118553980413, + "language_loss": 0.81866097, + "learning_rate": 0.0009314990686954943, + "loss": 0.83030909, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.18969727, + "step": 1006, + "time_per_iteration": 2.955195903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198123, + "balance_loss_mlp": 1.1794908, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.08085614110860996, + "language_loss": 0.80862725, + "learning_rate": 0.000931341591557042, + "loss": 0.8206085, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.18615723, + "step": 1007, + "time_per_iteration": 3.74294114112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192787, + "balance_loss_mlp": 1.17408264, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.10092207476563657, + "language_loss": 0.87274837, + "learning_rate": 0.0009311839469569325, + "loss": 0.88467628, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.18701172, + "step": 1008, + "time_per_iteration": 2.7143359184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188299, + "balance_loss_mlp": 1.16947544, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.10252165229611418, + "language_loss": 0.86257041, + "learning_rate": 0.0009310261349563687, + "loss": 0.87445343, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.18823242, + "step": 1009, + "time_per_iteration": 2.7420098781585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156356, + "balance_loss_mlp": 1.13825965, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.06920598095285249, + "language_loss": 0.8520751, + "learning_rate": 0.0009308681556166186, + "loss": 0.86363864, + 
"num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.18103027, + "step": 1010, + "time_per_iteration": 2.8593883514404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162311, + "balance_loss_mlp": 1.14391661, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.10589580567356643, + "language_loss": 0.87318867, + "learning_rate": 0.0009307100089990152, + "loss": 0.88481176, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.18408203, + "step": 1011, + "time_per_iteration": 2.7444002628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144739, + "balance_loss_mlp": 1.12624931, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.10287575048528846, + "language_loss": 0.83773112, + "learning_rate": 0.0009305516951649568, + "loss": 0.84917855, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.18481445, + "step": 1012, + "time_per_iteration": 2.7355475425720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174282, + "balance_loss_mlp": 1.15630519, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07063143065951083, + "language_loss": 0.86586678, + "learning_rate": 0.0009303932141759057, + "loss": 0.87760961, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.17980957, + "step": 1013, + "time_per_iteration": 2.778740882873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166591, + "balance_loss_mlp": 1.14829278, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.09801085242945827, + "language_loss": 0.83495271, + "learning_rate": 0.0009302345660933902, + "loss": 0.84661865, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.18286133, + "step": 1014, + "time_per_iteration": 2.8084325790405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178039, + "balance_loss_mlp": 1.1603483, + "epoch": 0.19526741054251634, + "flos": 
671081946624.0, + "grad_norm": 0.1010340318018862, + "language_loss": 0.84950441, + "learning_rate": 0.0009300757509790026, + "loss": 0.86128479, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.17712402, + "step": 1015, + "time_per_iteration": 2.9023685455322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179228, + "balance_loss_mlp": 1.16137052, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.1305336983537898, + "language_loss": 0.90272522, + "learning_rate": 0.0009299167688944005, + "loss": 0.91451752, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.17883301, + "step": 1016, + "time_per_iteration": 2.5396370887756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180846, + "balance_loss_mlp": 1.16236818, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.10642959866559894, + "language_loss": 0.85698497, + "learning_rate": 0.0009297576199013063, + "loss": 0.86879343, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.18457031, + "step": 1017, + "time_per_iteration": 2.7503206729888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151973, + "balance_loss_mlp": 1.13890779, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.05607404145793752, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74154103, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.13085938, + "step": 1018, + "time_per_iteration": 4.931609153747559 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106819, + "balance_loss_mlp": 1.09365869, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.04672191734885249, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80533117, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.13183594, + "step": 1019, + "time_per_iteration": 5.336720705032349 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228797, + "balance_loss_mlp": 1.21011734, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 0.07997087287444872, + "language_loss": 0.86300683, + "learning_rate": 0.0009292791720892659, + "loss": 0.8752948, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.18664551, + "step": 1020, + "time_per_iteration": 2.8861892223358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221818, + "balance_loss_mlp": 1.20275593, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.08883950328468299, + "language_loss": 0.88082206, + "learning_rate": 0.0009291193560807218, + "loss": 0.89304024, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.19055176, + "step": 1021, + "time_per_iteration": 2.6382570266723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209623, + "balance_loss_mlp": 1.19078755, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.07890952504822618, + "language_loss": 0.86793423, + "learning_rate": 0.0009289593734732688, + "loss": 0.88003045, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.18811035, + "step": 1022, + "time_per_iteration": 2.6261141300201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185276, + "balance_loss_mlp": 1.16670358, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.0835325264325779, + "language_loss": 0.93570763, + "learning_rate": 0.0009287992243290175, + "loss": 0.94756043, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.18579102, + "step": 1023, + "time_per_iteration": 2.515672445297241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161758, + "balance_loss_mlp": 1.14213622, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.07747777445836627, + "language_loss": 0.9021076, + "learning_rate": 0.0009286389087101435, + "loss": 0.9137252, 
+ "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.19604492, + "step": 1024, + "time_per_iteration": 2.8165409564971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144138, + "balance_loss_mlp": 1.12458754, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.093529298896635, + "language_loss": 0.88402045, + "learning_rate": 0.0009284784266788864, + "loss": 0.8954618, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.19543457, + "step": 1025, + "time_per_iteration": 2.746727705001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143253, + "balance_loss_mlp": 1.12456095, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.07377516343298976, + "language_loss": 0.92142463, + "learning_rate": 0.0009283177782975512, + "loss": 0.9328571, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.18688965, + "step": 1026, + "time_per_iteration": 3.0783705711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125598, + "balance_loss_mlp": 1.1064887, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.09283572483169282, + "language_loss": 0.87607288, + "learning_rate": 0.000928156963628507, + "loss": 0.8873288, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.19116211, + "step": 1027, + "time_per_iteration": 2.6074790954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119339, + "balance_loss_mlp": 1.09947884, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.17318139898935403, + "language_loss": 0.87847698, + "learning_rate": 0.0009279959827341877, + "loss": 0.88967031, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.1986084, + "step": 1028, + "time_per_iteration": 2.786883592605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122475, + "balance_loss_mlp": 1.10186362, + "epoch": 0.19796075413620623, + "flos": 
503058719232.0, + "grad_norm": 0.09725837933244906, + "language_loss": 0.87463772, + "learning_rate": 0.0009278348356770915, + "loss": 0.88586247, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.20617676, + "step": 1029, + "time_per_iteration": 2.6152124404907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115903, + "balance_loss_mlp": 1.09576869, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.09726879406227856, + "language_loss": 0.85104239, + "learning_rate": 0.0009276735225197814, + "loss": 0.86220145, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.20129395, + "step": 1030, + "time_per_iteration": 2.6491973400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140863, + "balance_loss_mlp": 1.12079978, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.07981294302307375, + "language_loss": 0.85465813, + "learning_rate": 0.0009275120433248847, + "loss": 0.86606669, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.20056152, + "step": 1031, + "time_per_iteration": 2.7181904315948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170044, + "balance_loss_mlp": 1.14986157, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.08870117223998657, + "language_loss": 0.85574758, + "learning_rate": 0.0009273503981550931, + "loss": 0.86744803, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.20178223, + "step": 1032, + "time_per_iteration": 3.15751576423645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210574, + "balance_loss_mlp": 1.19066548, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.10622365116136065, + "language_loss": 0.86958814, + "learning_rate": 0.0009271885870731626, + "loss": 0.88169384, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.19909668, + "step": 1033, + "time_per_iteration": 2.513871431350708 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124371, + "balance_loss_mlp": 1.22355127, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 0.12163862472720371, + "language_loss": 0.88120484, + "learning_rate": 0.0009270266101419143, + "loss": 0.89364195, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.20153809, + "step": 1034, + "time_per_iteration": 2.6154308319091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233971, + "balance_loss_mlp": 1.21453989, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.13626001105869123, + "language_loss": 0.84950191, + "learning_rate": 0.0009268644674242328, + "loss": 0.86184162, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.19433594, + "step": 1035, + "time_per_iteration": 2.706982135772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220957, + "balance_loss_mlp": 1.20152593, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.09310216058180905, + "language_loss": 0.80796313, + "learning_rate": 0.0009267021589830678, + "loss": 0.82017273, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.19421387, + "step": 1036, + "time_per_iteration": 2.641144275665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300787, + "balance_loss_mlp": 1.28457427, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.08257719551105532, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78927869, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.16210938, + "step": 1037, + "time_per_iteration": 5.017476558685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198691, + "balance_loss_mlp": 1.17903364, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.08600893320147879, + "language_loss": 0.92715919, + "learning_rate": 0.000926377045182406, + "loss": 0.93914616, + 
"num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.19641113, + "step": 1038, + "time_per_iteration": 2.939668893814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215159, + "balance_loss_mlp": 1.19595408, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.24386100452943713, + "language_loss": 0.87511599, + "learning_rate": 0.0009262142399491296, + "loss": 0.88726759, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.19189453, + "step": 1039, + "time_per_iteration": 3.0862977504730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248948, + "balance_loss_mlp": 1.22932601, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.09408226392225982, + "language_loss": 0.87996912, + "learning_rate": 0.0009260512692448105, + "loss": 0.89245868, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.19604492, + "step": 1040, + "time_per_iteration": 2.711160182952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288003, + "balance_loss_mlp": 1.26749945, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.13301921079143278, + "language_loss": 0.84115559, + "learning_rate": 0.000925888133132719, + "loss": 0.85403562, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.20507812, + "step": 1041, + "time_per_iteration": 2.740140199661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166251, + "balance_loss_mlp": 1.1515646, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.059408002972858115, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8077668, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.14648438, + "step": 1042, + "time_per_iteration": 4.983680009841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01318672, + "balance_loss_mlp": 1.29690433, + "epoch": 0.20065409772989612, + "flos": 
496528247808.0, + "grad_norm": 0.1163225797864763, + "language_loss": 0.81054026, + "learning_rate": 0.0009255613649386244, + "loss": 0.82372701, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.21777344, + "step": 1043, + "time_per_iteration": 2.6790683269500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300915, + "balance_loss_mlp": 1.27936232, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.10848871275509671, + "language_loss": 0.78969169, + "learning_rate": 0.0009253977329834838, + "loss": 0.80270082, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.21569824, + "step": 1044, + "time_per_iteration": 2.6970701217651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286746, + "balance_loss_mlp": 1.26458514, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.09565462118383694, + "language_loss": 0.86161876, + "learning_rate": 0.0009252339358742965, + "loss": 0.87448621, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.22167969, + "step": 1045, + "time_per_iteration": 2.87453556060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129502, + "balance_loss_mlp": 1.2733593, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.10796199739740596, + "language_loss": 0.83195245, + "learning_rate": 0.000925069973674654, + "loss": 0.84490263, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.21679688, + "step": 1046, + "time_per_iteration": 2.6612823009490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275399, + "balance_loss_mlp": 1.25408411, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.06722367899146847, + "language_loss": 0.88250053, + "learning_rate": 0.000924905846448212, + "loss": 0.89525455, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.21325684, + "step": 1047, + "time_per_iteration": 2.730875015258789 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01292917, + "balance_loss_mlp": 1.27123272, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 0.09038052031526789, + "language_loss": 0.85797572, + "learning_rate": 0.0009247415542586906, + "loss": 0.87090492, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.21691895, + "step": 1048, + "time_per_iteration": 2.8412506580352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248583, + "balance_loss_mlp": 1.22672033, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.08064336148566398, + "language_loss": 0.83021247, + "learning_rate": 0.0009245770971698735, + "loss": 0.84269828, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.21875, + "step": 1049, + "time_per_iteration": 4.440186023712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237632, + "balance_loss_mlp": 1.21671033, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.08794152426297831, + "language_loss": 0.88490599, + "learning_rate": 0.0009244124752456087, + "loss": 0.89728236, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.20922852, + "step": 1050, + "time_per_iteration": 2.529827833175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224991, + "balance_loss_mlp": 1.20434391, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.07833105787078826, + "language_loss": 0.85121548, + "learning_rate": 0.0009242476885498081, + "loss": 0.86346543, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.20654297, + "step": 1051, + "time_per_iteration": 2.7487235069274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201681, + "balance_loss_mlp": 1.18077159, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.09537947845979083, + "language_loss": 0.80832058, + "learning_rate": 0.0009240827371464474, + "loss": 0.82033736, + 
"num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.20922852, + "step": 1052, + "time_per_iteration": 2.570289373397827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190217, + "balance_loss_mlp": 1.16978419, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.0749559041873476, + "language_loss": 0.83869404, + "learning_rate": 0.0009239176210995666, + "loss": 0.85059625, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.2043457, + "step": 1053, + "time_per_iteration": 3.48331880569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164732, + "balance_loss_mlp": 1.14463329, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.08759256892165929, + "language_loss": 0.9366219, + "learning_rate": 0.0009237523404732695, + "loss": 0.94826925, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.2010498, + "step": 1054, + "time_per_iteration": 2.8900768756866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152229, + "balance_loss_mlp": 1.13102162, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.08554891996887364, + "language_loss": 0.84106672, + "learning_rate": 0.0009235868953317235, + "loss": 0.85258889, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.21191406, + "step": 1055, + "time_per_iteration": 2.805739402770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152429, + "balance_loss_mlp": 1.1321516, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.08283401132516657, + "language_loss": 0.84830916, + "learning_rate": 0.0009234212857391602, + "loss": 0.85983348, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.20275879, + "step": 1056, + "time_per_iteration": 3.2523794174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150661, + "balance_loss_mlp": 1.13000214, + "epoch": 0.20334744132358598, + "flos": 
562111197696.0, + "grad_norm": 0.08956025084292601, + "language_loss": 0.88911903, + "learning_rate": 0.000923255511759875, + "loss": 0.90062559, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.20666504, + "step": 1057, + "time_per_iteration": 2.7904763221740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144001, + "balance_loss_mlp": 1.12379456, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.0943960049444156, + "language_loss": 0.84853089, + "learning_rate": 0.000923089573458227, + "loss": 0.85997093, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.2019043, + "step": 1058, + "time_per_iteration": 2.8817007541656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152473, + "balance_loss_mlp": 1.13152814, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.0957717786757319, + "language_loss": 0.83558518, + "learning_rate": 0.0009229234708986392, + "loss": 0.84710991, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.20947266, + "step": 1059, + "time_per_iteration": 2.9059059619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179467, + "balance_loss_mlp": 1.1632545, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.05660436116329576, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82846367, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.16210938, + "step": 1060, + "time_per_iteration": 4.709235429763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158087, + "balance_loss_mlp": 1.13642621, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.07273861691254356, + "language_loss": 0.84919071, + "learning_rate": 0.0009225907732636548, + "loss": 0.86077166, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.2166748, + "step": 1061, + "time_per_iteration": 2.7832870483398438 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170458, + "balance_loss_mlp": 1.14922678, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 0.10826308082162117, + "language_loss": 0.86149454, + "learning_rate": 0.0009224241783174227, + "loss": 0.87319911, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.21252441, + "step": 1062, + "time_per_iteration": 2.7493624687194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116711, + "balance_loss_mlp": 1.14574718, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.0807963285895634, + "language_loss": 0.85689318, + "learning_rate": 0.0009222574193715802, + "loss": 0.86856437, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.21374512, + "step": 1063, + "time_per_iteration": 2.8018240928649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159842, + "balance_loss_mlp": 1.13889694, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.07340119955686962, + "language_loss": 0.85735941, + "learning_rate": 0.000922090496490869, + "loss": 0.86895782, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.20947266, + "step": 1064, + "time_per_iteration": 2.765749931335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152698, + "balance_loss_mlp": 1.13164544, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.07242145518951734, + "language_loss": 0.89867234, + "learning_rate": 0.0009219234097400937, + "loss": 0.9101994, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.21057129, + "step": 1065, + "time_per_iteration": 2.8627817630767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114786, + "balance_loss_mlp": 1.12674773, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.08464925787423999, + "language_loss": 0.83060288, + "learning_rate": 0.0009217561591841237, + "loss": 0.84208149, + 
"num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.21130371, + "step": 1066, + "time_per_iteration": 3.3423283100128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142136, + "balance_loss_mlp": 1.12129867, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.08558035413219019, + "language_loss": 0.80671912, + "learning_rate": 0.0009215887448878913, + "loss": 0.81814051, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.20849609, + "step": 1067, + "time_per_iteration": 2.5908420085906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133301, + "balance_loss_mlp": 1.11204648, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.08226430294551884, + "language_loss": 0.8469618, + "learning_rate": 0.0009214211669163922, + "loss": 0.85829484, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.21264648, + "step": 1068, + "time_per_iteration": 2.70798397064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136986, + "balance_loss_mlp": 1.11625564, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.08433693913464968, + "language_loss": 0.9379245, + "learning_rate": 0.0009212534253346862, + "loss": 0.94929433, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.20727539, + "step": 1069, + "time_per_iteration": 2.7776713371276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129003, + "balance_loss_mlp": 1.10772455, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.09450930819857521, + "language_loss": 0.8384515, + "learning_rate": 0.0009210855202078964, + "loss": 0.84974158, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.2130127, + "step": 1070, + "time_per_iteration": 2.6283328533172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130904, + "balance_loss_mlp": 1.11017382, + "epoch": 0.20604078491727587, + "flos": 
433169109504.0, + "grad_norm": 0.08132695111234396, + "language_loss": 0.86854172, + "learning_rate": 0.0009209174516012091, + "loss": 0.87985075, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.20751953, + "step": 1071, + "time_per_iteration": 2.535447120666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133709, + "balance_loss_mlp": 1.11270416, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.11111326067788187, + "language_loss": 0.88662505, + "learning_rate": 0.0009207492195798747, + "loss": 0.89796209, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.21008301, + "step": 1072, + "time_per_iteration": 2.7883682250976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144236, + "balance_loss_mlp": 1.12275457, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.10819626667436329, + "language_loss": 0.84654653, + "learning_rate": 0.0009205808242092061, + "loss": 0.85798889, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.21484375, + "step": 1073, + "time_per_iteration": 2.6761436462402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166479, + "balance_loss_mlp": 1.1445806, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.10070475961417262, + "language_loss": 0.82806575, + "learning_rate": 0.0009204122655545808, + "loss": 0.8397305, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.21911621, + "step": 1074, + "time_per_iteration": 3.326646089553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169219, + "balance_loss_mlp": 1.14714098, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.07526963641038939, + "language_loss": 0.80370897, + "learning_rate": 0.0009202435436814388, + "loss": 0.8154012, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.22070312, + "step": 1075, + "time_per_iteration": 2.718374013900757 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117617, + "balance_loss_mlp": 1.15484309, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 0.08141199692544657, + "language_loss": 0.89125872, + "learning_rate": 0.0009200746586552836, + "loss": 0.90302044, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.21350098, + "step": 1076, + "time_per_iteration": 2.9237890243530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116406, + "balance_loss_mlp": 1.14320993, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.08915437819246362, + "language_loss": 0.83578765, + "learning_rate": 0.0009199056105416825, + "loss": 0.8474282, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.20861816, + "step": 1077, + "time_per_iteration": 3.1017873287200928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174288, + "balance_loss_mlp": 1.15383148, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.08235458210831342, + "language_loss": 0.8621031, + "learning_rate": 0.0009197363994062654, + "loss": 0.87384599, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.20458984, + "step": 1078, + "time_per_iteration": 2.832416296005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115862, + "balance_loss_mlp": 1.13828301, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.14524602294533026, + "language_loss": 0.8378703, + "learning_rate": 0.0009195670253147262, + "loss": 0.84945655, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.20336914, + "step": 1079, + "time_per_iteration": 2.9912445545196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130057, + "balance_loss_mlp": 1.11056602, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.07398728313760368, + "language_loss": 0.81629539, + "learning_rate": 0.0009193974883328216, + "loss": 0.82759595, + 
"num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.19470215, + "step": 1080, + "time_per_iteration": 2.636516809463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142545, + "balance_loss_mlp": 1.12286365, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.08145379169955597, + "language_loss": 0.86828917, + "learning_rate": 0.0009192277885263718, + "loss": 0.87971467, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.19665527, + "step": 1081, + "time_per_iteration": 2.7361197471618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137254, + "balance_loss_mlp": 1.11765575, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.09498097190043973, + "language_loss": 0.85732365, + "learning_rate": 0.0009190579259612602, + "loss": 0.86869615, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.19580078, + "step": 1082, + "time_per_iteration": 3.3791959285736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156614, + "balance_loss_mlp": 1.13621759, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.1488703614850634, + "language_loss": 0.86399055, + "learning_rate": 0.000918887900703433, + "loss": 0.87555665, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.20397949, + "step": 1083, + "time_per_iteration": 2.8133795261383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148544, + "balance_loss_mlp": 1.129125, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.0859641513447352, + "language_loss": 0.90200919, + "learning_rate": 0.0009187177128188999, + "loss": 0.91349459, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.19396973, + "step": 1084, + "time_per_iteration": 2.4999842643737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286106, + "balance_loss_mlp": 1.27151525, + "epoch": 0.20873412851096576, + "flos": 
1402147293696.0, + "grad_norm": 0.08105811039849961, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78442645, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.14550781, + "step": 1085, + "time_per_iteration": 4.8958563804626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153669, + "balance_loss_mlp": 1.13441706, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.08197687066157772, + "language_loss": 0.85811758, + "learning_rate": 0.000918376849434071, + "loss": 0.86965424, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.19250488, + "step": 1086, + "time_per_iteration": 2.5344736576080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118214, + "balance_loss_mlp": 1.16158867, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.10825532619194118, + "language_loss": 0.90649915, + "learning_rate": 0.0009182061740661098, + "loss": 0.9183206, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.20556641, + "step": 1087, + "time_per_iteration": 2.5707151889801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178442, + "balance_loss_mlp": 1.15811718, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.08160475290131898, + "language_loss": 0.84683895, + "learning_rate": 0.0009180353363361127, + "loss": 0.85862345, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.203125, + "step": 1088, + "time_per_iteration": 3.137329339981079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174451, + "balance_loss_mlp": 1.15374422, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.10140667942926032, + "language_loss": 0.81920874, + "learning_rate": 0.0009178643363104044, + "loss": 0.83095324, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.20715332, + "step": 1089, + "time_per_iteration": 3.1493358612060547 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147137, + "balance_loss_mlp": 1.12660897, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 0.10442412310556573, + "language_loss": 0.90355861, + "learning_rate": 0.0009176931740553735, + "loss": 0.91503, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.20532227, + "step": 1090, + "time_per_iteration": 2.5557990074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139677, + "balance_loss_mlp": 1.11933959, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.17656839042402708, + "language_loss": 0.82232946, + "learning_rate": 0.0009175218496374708, + "loss": 0.83372623, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.20349121, + "step": 1091, + "time_per_iteration": 3.3492214679718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132775, + "balance_loss_mlp": 1.11287904, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.09269359078641065, + "language_loss": 0.85681468, + "learning_rate": 0.0009173503631232103, + "loss": 0.86814249, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.19885254, + "step": 1092, + "time_per_iteration": 3.396247386932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131208, + "balance_loss_mlp": 1.11091864, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.09283462310009857, + "language_loss": 0.81684232, + "learning_rate": 0.0009171787145791691, + "loss": 0.82815444, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.20288086, + "step": 1093, + "time_per_iteration": 3.2441000938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132702, + "balance_loss_mlp": 1.11279404, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.14183927725725606, + "language_loss": 0.79456544, + "learning_rate": 0.000917006904071987, + "loss": 0.80589247, 
+ "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.19897461, + "step": 1094, + "time_per_iteration": 2.658992052078247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140578, + "balance_loss_mlp": 1.12040734, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.07963562881698232, + "language_loss": 0.86590552, + "learning_rate": 0.0009168349316683669, + "loss": 0.87731135, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.20166016, + "step": 1095, + "time_per_iteration": 2.7208545207977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157939, + "balance_loss_mlp": 1.1382103, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.06948100196361624, + "language_loss": 0.82885933, + "learning_rate": 0.0009166627974350741, + "loss": 0.84043866, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.19714355, + "step": 1096, + "time_per_iteration": 2.879690647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158751, + "balance_loss_mlp": 1.13850892, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.07894738519235364, + "language_loss": 0.89620626, + "learning_rate": 0.0009164905014389373, + "loss": 0.90779376, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.20239258, + "step": 1097, + "time_per_iteration": 2.7915890216827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174722, + "balance_loss_mlp": 1.15442061, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.08089010798718275, + "language_loss": 0.86655492, + "learning_rate": 0.0009163180437468476, + "loss": 0.87830216, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.20300293, + "step": 1098, + "time_per_iteration": 2.671910285949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160878, + "balance_loss_mlp": 1.14083886, + "epoch": 0.21142747210465565, + "flos": 
451188652032.0, + "grad_norm": 0.1273171739233691, + "language_loss": 0.85848475, + "learning_rate": 0.000916145424425759, + "loss": 0.87009346, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.20031738, + "step": 1099, + "time_per_iteration": 2.718719959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138682, + "balance_loss_mlp": 1.11927521, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.11827895321179892, + "language_loss": 0.90551817, + "learning_rate": 0.0009159726435426885, + "loss": 0.91690505, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.19384766, + "step": 1100, + "time_per_iteration": 4.622005939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096537, + "balance_loss_mlp": 1.07577038, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.08009025902543959, + "language_loss": 0.90283167, + "learning_rate": 0.0009157997011647154, + "loss": 0.91379714, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.2076416, + "step": 1101, + "time_per_iteration": 2.605741262435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082643, + "balance_loss_mlp": 1.0622586, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.10006580652166666, + "language_loss": 0.85976642, + "learning_rate": 0.0009156265973589817, + "loss": 0.87059283, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.20385742, + "step": 1102, + "time_per_iteration": 2.7997629642486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082906, + "balance_loss_mlp": 1.06256843, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.08882618780300273, + "language_loss": 0.89710194, + "learning_rate": 0.0009154533321926926, + "loss": 0.90793097, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.20336914, + "step": 1103, + "time_per_iteration": 2.6505167484283447 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082845, + "balance_loss_mlp": 1.06240106, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 0.08104008133152642, + "language_loss": 0.87105876, + "learning_rate": 0.0009152799057331156, + "loss": 0.88188726, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.20446777, + "step": 1104, + "time_per_iteration": 3.16381573677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085023, + "balance_loss_mlp": 1.06503153, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.1303184369793021, + "language_loss": 0.90978825, + "learning_rate": 0.0009151063180475805, + "loss": 0.92063844, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.1998291, + "step": 1105, + "time_per_iteration": 2.519392490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081303, + "balance_loss_mlp": 1.06139469, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.09253503988008102, + "language_loss": 0.84230483, + "learning_rate": 0.0009149325692034803, + "loss": 0.85311788, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.19897461, + "step": 1106, + "time_per_iteration": 2.623030662536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122847, + "balance_loss_mlp": 1.11054456, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.03239256029122438, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80326271, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.12304688, + "step": 1107, + "time_per_iteration": 4.865934610366821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095405, + "balance_loss_mlp": 1.07612848, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.08663251382833077, + "language_loss": 0.87545854, + "learning_rate": 0.0009145845883094678, + "loss": 0.88641262, + 
"num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.19262695, + "step": 1108, + "time_per_iteration": 3.0644633769989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106513, + "balance_loss_mlp": 1.08767843, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.09154471330204571, + "language_loss": 0.84864843, + "learning_rate": 0.000914410356394654, + "loss": 0.85971349, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.18798828, + "step": 1109, + "time_per_iteration": 2.7818005084991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111664, + "balance_loss_mlp": 1.09850883, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.05901208331379503, + "language_loss": 0.84397328, + "learning_rate": 0.0009142359635914709, + "loss": 0.85513967, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.18151855, + "step": 1110, + "time_per_iteration": 3.0699398517608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132455, + "balance_loss_mlp": 1.11437058, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.07045633933043649, + "language_loss": 0.84396905, + "learning_rate": 0.0009140614099676245, + "loss": 0.85529351, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.18103027, + "step": 1111, + "time_per_iteration": 2.6896469593048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144685, + "balance_loss_mlp": 1.12654102, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.07609754946919366, + "language_loss": 0.82333195, + "learning_rate": 0.0009138866955908821, + "loss": 0.83477879, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.18151855, + "step": 1112, + "time_per_iteration": 2.9167656898498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173372, + "balance_loss_mlp": 1.15541935, + "epoch": 0.2141208156983455, + "flos": 
748996803072.0, + "grad_norm": 0.07536024812721688, + "language_loss": 0.80650687, + "learning_rate": 0.0009137118205290738, + "loss": 0.81824064, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.17956543, + "step": 1113, + "time_per_iteration": 3.038858652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173379, + "balance_loss_mlp": 1.15471053, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.08578166607433227, + "language_loss": 0.9008798, + "learning_rate": 0.0009135367848500924, + "loss": 0.91261363, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.18652344, + "step": 1114, + "time_per_iteration": 2.5301332473754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183524, + "balance_loss_mlp": 1.16561842, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.097679735811004, + "language_loss": 0.86396897, + "learning_rate": 0.0009133615886218927, + "loss": 0.87580419, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.17932129, + "step": 1115, + "time_per_iteration": 2.7787675857543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181259, + "balance_loss_mlp": 1.16279316, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.08896664083513224, + "language_loss": 0.87571919, + "learning_rate": 0.0009131862319124917, + "loss": 0.88753176, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.18469238, + "step": 1116, + "time_per_iteration": 2.7031164169311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177922, + "balance_loss_mlp": 1.15970659, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.07771910148821705, + "language_loss": 0.8379603, + "learning_rate": 0.0009130107147899691, + "loss": 0.84973955, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.18237305, + "step": 1117, + "time_per_iteration": 2.7842912673950195 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180049, + "balance_loss_mlp": 1.16186976, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 0.07252648730513606, + "language_loss": 0.85351467, + "learning_rate": 0.0009128350373224665, + "loss": 0.86531514, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.1817627, + "step": 1118, + "time_per_iteration": 2.547067880630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174302, + "balance_loss_mlp": 1.1582799, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.06807222888709992, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.8263073, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.16015625, + "step": 1119, + "time_per_iteration": 4.686914443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191791, + "balance_loss_mlp": 1.1730994, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07584418562153701, + "language_loss": 0.85298818, + "learning_rate": 0.0009124832016254005, + "loss": 0.86490607, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.18676758, + "step": 1120, + "time_per_iteration": 2.594407558441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179476, + "balance_loss_mlp": 1.16062903, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.07950209413805702, + "language_loss": 0.87972558, + "learning_rate": 0.0009123070435324316, + "loss": 0.89152032, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.18835449, + "step": 1121, + "time_per_iteration": 2.8215177059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068291, + "balance_loss_mlp": 1.05379486, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.028005803680130233, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78944069, + 
"num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.14453125, + "step": 1122, + "time_per_iteration": 5.0041632652282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159249, + "balance_loss_mlp": 1.14079511, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.08251943361984397, + "language_loss": 0.86073762, + "learning_rate": 0.0009119542471995752, + "loss": 0.87233007, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.18432617, + "step": 1123, + "time_per_iteration": 2.862286329269409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163328, + "balance_loss_mlp": 1.14537501, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.09258223897772182, + "language_loss": 0.81420332, + "learning_rate": 0.0009117776090966554, + "loss": 0.8258366, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.17956543, + "step": 1124, + "time_per_iteration": 2.957061767578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178568, + "balance_loss_mlp": 1.15982795, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.08713542738122697, + "language_loss": 0.86376691, + "learning_rate": 0.0009116008111274899, + "loss": 0.87555259, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.18725586, + "step": 1125, + "time_per_iteration": 3.2553656101226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134598, + "balance_loss_mlp": 1.12191415, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.04404830998294008, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80241525, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.12695312, + "step": 1126, + "time_per_iteration": 4.808468818664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178721, + "balance_loss_mlp": 1.16074455, + "epoch": 0.2168141592920354, + "flos": 
887395046400.0, + "grad_norm": 0.11245559393918578, + "language_loss": 0.8463136, + "learning_rate": 0.0009112467358650396, + "loss": 0.85810077, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.17993164, + "step": 1127, + "time_per_iteration": 3.2135119438171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203773, + "balance_loss_mlp": 1.18573689, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.12344639465473216, + "language_loss": 0.86497682, + "learning_rate": 0.0009110694587092192, + "loss": 0.87701452, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.18041992, + "step": 1128, + "time_per_iteration": 2.76655650138855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187728, + "balance_loss_mlp": 1.17007267, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.08979647183610162, + "language_loss": 0.81230694, + "learning_rate": 0.0009108920219620815, + "loss": 0.82418424, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.17675781, + "step": 1129, + "time_per_iteration": 2.654778242111206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213499, + "balance_loss_mlp": 1.19534314, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.09421163362280094, + "language_loss": 0.89139944, + "learning_rate": 0.0009107144256925133, + "loss": 0.90353441, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.18164062, + "step": 1130, + "time_per_iteration": 2.6828513145446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118696, + "balance_loss_mlp": 1.1690309, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.10043397732842237, + "language_loss": 0.82135975, + "learning_rate": 0.0009105366699694638, + "loss": 0.83322936, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.17944336, + "step": 1131, + "time_per_iteration": 2.7368264198303223 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156124, + "balance_loss_mlp": 1.13807523, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 0.06866995192565088, + "language_loss": 0.8126269, + "learning_rate": 0.0009103587548619439, + "loss": 0.82418817, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.18066406, + "step": 1132, + "time_per_iteration": 2.8550221920013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127864, + "balance_loss_mlp": 1.10951805, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.07626365128544196, + "language_loss": 0.85966831, + "learning_rate": 0.0009101806804390261, + "loss": 0.87094694, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.18359375, + "step": 1133, + "time_per_iteration": 2.865067720413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104426, + "balance_loss_mlp": 1.08616304, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.0835029551695644, + "language_loss": 0.89787459, + "learning_rate": 0.0009100024467698453, + "loss": 0.90891886, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.18261719, + "step": 1134, + "time_per_iteration": 2.587308645248413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107106, + "balance_loss_mlp": 1.08858073, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.1261525750794289, + "language_loss": 0.8228271, + "learning_rate": 0.0009098240539235981, + "loss": 0.83389813, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.1854248, + "step": 1135, + "time_per_iteration": 2.672178268432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118389, + "balance_loss_mlp": 1.10042465, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.07190677595982913, + "language_loss": 0.87357873, + "learning_rate": 0.0009096455019695423, + "loss": 0.88476264, + 
"num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.17980957, + "step": 1136, + "time_per_iteration": 2.7987098693847656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132882, + "balance_loss_mlp": 1.1147505, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.07940180090442328, + "language_loss": 0.89624888, + "learning_rate": 0.000909466790976998, + "loss": 0.90757769, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.18139648, + "step": 1137, + "time_per_iteration": 2.477332830429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135864, + "balance_loss_mlp": 1.11760151, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.0834179991172278, + "language_loss": 0.82063508, + "learning_rate": 0.0009092879210153473, + "loss": 0.83199376, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.18261719, + "step": 1138, + "time_per_iteration": 3.12052321434021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144384, + "balance_loss_mlp": 1.12646723, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.08144398942367967, + "language_loss": 0.88541782, + "learning_rate": 0.0009091088921540333, + "loss": 0.89686167, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.17919922, + "step": 1139, + "time_per_iteration": 2.616718292236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059921, + "balance_loss_mlp": 1.04833436, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.03144960121690337, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76568598, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.11572266, + "step": 1140, + "time_per_iteration": 4.950219392776489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159199, + "balance_loss_mlp": 1.14099586, + "epoch": 0.2195075028857253, + "flos": 
591175646208.0, + "grad_norm": 0.08175747516698374, + "language_loss": 0.84013134, + "learning_rate": 0.0009087503580104985, + "loss": 0.85172331, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.18212891, + "step": 1141, + "time_per_iteration": 2.7156832218170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169076, + "balance_loss_mlp": 1.15111113, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.09158845445189351, + "language_loss": 0.7908268, + "learning_rate": 0.0009085708528674728, + "loss": 0.80251753, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.17993164, + "step": 1142, + "time_per_iteration": 2.7931153774261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164556, + "balance_loss_mlp": 1.14653111, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.08286913258708346, + "language_loss": 0.86118239, + "learning_rate": 0.0009083911891031745, + "loss": 0.87282795, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.18041992, + "step": 1143, + "time_per_iteration": 3.116783857345581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117374, + "balance_loss_mlp": 1.15575087, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.10598120448533326, + "language_loss": 0.91152728, + "learning_rate": 0.0009082113667873553, + "loss": 0.92326462, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.18005371, + "step": 1144, + "time_per_iteration": 3.1333653926849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165589, + "balance_loss_mlp": 1.14781499, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.09559133609898889, + "language_loss": 0.9010762, + "learning_rate": 0.0009080313859898283, + "loss": 0.91273212, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.17773438, + "step": 1145, + "time_per_iteration": 2.5269837379455566 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158069, + "balance_loss_mlp": 1.13981819, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 0.08379728657337264, + "language_loss": 0.91627228, + "learning_rate": 0.0009078512467804684, + "loss": 0.92785299, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.18249512, + "step": 1146, + "time_per_iteration": 2.6481103897094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.13930488, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.08494148813195015, + "language_loss": 0.90029317, + "learning_rate": 0.0009076709492292119, + "loss": 0.91186154, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.17541504, + "step": 1147, + "time_per_iteration": 2.659444808959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156044, + "balance_loss_mlp": 1.1380074, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.08635236800942281, + "language_loss": 0.88836294, + "learning_rate": 0.0009074904934060562, + "loss": 0.89992332, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.18041992, + "step": 1148, + "time_per_iteration": 2.6803669929504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154403, + "balance_loss_mlp": 1.13666439, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.0889091403520225, + "language_loss": 0.84333098, + "learning_rate": 0.0009073098793810607, + "loss": 0.85487497, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.17749023, + "step": 1149, + "time_per_iteration": 2.9655888080596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142518, + "balance_loss_mlp": 1.12488723, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.1004212836055253, + "language_loss": 0.88171208, + "learning_rate": 0.000907129107224346, + "loss": 0.89313722, + 
"num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.17651367, + "step": 1150, + "time_per_iteration": 2.7072501182556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114998, + "balance_loss_mlp": 1.13255119, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.06570196764831916, + "language_loss": 0.88176614, + "learning_rate": 0.0009069481770060939, + "loss": 0.8932659, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.17443848, + "step": 1151, + "time_per_iteration": 2.685103178024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154372, + "balance_loss_mlp": 1.13708711, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.09650141097201487, + "language_loss": 0.83268076, + "learning_rate": 0.000906767088796548, + "loss": 0.84422451, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.17297363, + "step": 1152, + "time_per_iteration": 3.4740118980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116577, + "balance_loss_mlp": 1.14875841, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.08954893541671843, + "language_loss": 0.86883795, + "learning_rate": 0.0009065858426660127, + "loss": 0.88049567, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.17028809, + "step": 1153, + "time_per_iteration": 2.6959545612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162713, + "balance_loss_mlp": 1.14552331, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.08642937771359972, + "language_loss": 0.84477949, + "learning_rate": 0.0009064044386848543, + "loss": 0.85640663, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.17199707, + "step": 1154, + "time_per_iteration": 2.9327309131622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148113, + "balance_loss_mlp": 1.13044643, + "epoch": 0.22220084647941515, + "flos": 
489239377920.0, + "grad_norm": 0.10097530204718137, + "language_loss": 0.8819679, + "learning_rate": 0.0009062228769234997, + "loss": 0.89344907, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.17675781, + "step": 1155, + "time_per_iteration": 2.594517469406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131691, + "balance_loss_mlp": 1.11384535, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.08570412042921306, + "language_loss": 0.80458236, + "learning_rate": 0.0009060411574524376, + "loss": 0.81589925, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.17858887, + "step": 1156, + "time_per_iteration": 2.6829988956451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121054, + "balance_loss_mlp": 1.10336328, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.09330017299295373, + "language_loss": 0.87879562, + "learning_rate": 0.0009058592803422178, + "loss": 0.89000618, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.17712402, + "step": 1157, + "time_per_iteration": 3.181018829345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121897, + "balance_loss_mlp": 1.10911822, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.048914379983556036, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79832184, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.12792969, + "step": 1158, + "time_per_iteration": 4.887088775634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115665, + "balance_loss_mlp": 1.0982244, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.0696072904806853, + "language_loss": 0.89700031, + "learning_rate": 0.00090549505348681, + "loss": 0.90815699, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.17456055, + "step": 1159, + "time_per_iteration": 2.598071813583374 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112274, + "balance_loss_mlp": 1.09486985, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 0.12380497141241992, + "language_loss": 0.83892691, + "learning_rate": 0.0009053127038830275, + "loss": 0.85004961, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.17407227, + "step": 1160, + "time_per_iteration": 2.972153663635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105235, + "balance_loss_mlp": 1.08817601, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.11211348915152936, + "language_loss": 0.86961317, + "learning_rate": 0.000905130196922898, + "loss": 0.88066548, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.17077637, + "step": 1161, + "time_per_iteration": 2.586404800415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103766, + "balance_loss_mlp": 1.08674335, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.08844003676149725, + "language_loss": 0.8712495, + "learning_rate": 0.0009049475326772769, + "loss": 0.88228714, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.17028809, + "step": 1162, + "time_per_iteration": 2.633775472640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115059, + "balance_loss_mlp": 1.09810734, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.08335674073816261, + "language_loss": 0.83002663, + "learning_rate": 0.0009047647112170811, + "loss": 0.84117723, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.16967773, + "step": 1163, + "time_per_iteration": 2.779890537261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112691, + "balance_loss_mlp": 1.11049509, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.19679577404354898, + "language_loss": 0.87137246, + "learning_rate": 0.0009045817326132876, + "loss": 0.88264161, + 
"num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.16418457, + "step": 1164, + "time_per_iteration": 3.703150749206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153627, + "balance_loss_mlp": 1.13630629, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.08115041291567808, + "language_loss": 0.83409214, + "learning_rate": 0.0009043985969369357, + "loss": 0.84562844, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.17333984, + "step": 1165, + "time_per_iteration": 2.8744845390319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175693, + "balance_loss_mlp": 1.15849137, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.06201627876445988, + "language_loss": 0.84104788, + "learning_rate": 0.0009042153042591245, + "loss": 0.85280478, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.17224121, + "step": 1166, + "time_per_iteration": 2.8617310523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184386, + "balance_loss_mlp": 1.16719604, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.08223980595448348, + "language_loss": 0.84917307, + "learning_rate": 0.0009040318546510146, + "loss": 0.86101699, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.17211914, + "step": 1167, + "time_per_iteration": 3.1852662563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184421, + "balance_loss_mlp": 1.16730213, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.0789242941151387, + "language_loss": 0.85142338, + "learning_rate": 0.0009038482481838275, + "loss": 0.86326754, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.17126465, + "step": 1168, + "time_per_iteration": 2.69252347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179663, + "balance_loss_mlp": 1.16241312, + "epoch": 0.22489419007310504, + "flos": 
834469972992.0, + "grad_norm": 0.05697426763288438, + "language_loss": 0.86826229, + "learning_rate": 0.0009036644849288455, + "loss": 0.88005894, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.17260742, + "step": 1169, + "time_per_iteration": 3.1488285064697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174012, + "balance_loss_mlp": 1.15652442, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.08495924937221859, + "language_loss": 0.85084724, + "learning_rate": 0.0009034805649574118, + "loss": 0.86258733, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.1751709, + "step": 1170, + "time_per_iteration": 2.685328722000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183548, + "balance_loss_mlp": 1.16578627, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.11014400581169416, + "language_loss": 0.85017669, + "learning_rate": 0.0009032964883409308, + "loss": 0.86201215, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.17785645, + "step": 1171, + "time_per_iteration": 2.879601240158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114011, + "balance_loss_mlp": 1.10170817, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.052120324196256125, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74164546, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.12255859, + "step": 1172, + "time_per_iteration": 5.038167715072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198228, + "balance_loss_mlp": 1.18021595, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.07370263777730128, + "language_loss": 0.87101096, + "learning_rate": 0.0009029278654587462, + "loss": 0.88299322, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.18017578, + "step": 1173, + "time_per_iteration": 2.627659559249878 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207558, + "balance_loss_mlp": 1.1888895, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 0.09375965630696953, + "language_loss": 0.82013619, + "learning_rate": 0.0009027433193361548, + "loss": 0.83221173, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.18652344, + "step": 1174, + "time_per_iteration": 2.8188316822052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191442, + "balance_loss_mlp": 1.17263079, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.09826481383842127, + "language_loss": 0.8677392, + "learning_rate": 0.00090255861685474, + "loss": 0.87965363, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.18798828, + "step": 1175, + "time_per_iteration": 2.7677559852600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187652, + "balance_loss_mlp": 1.16895974, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.09211807586067215, + "language_loss": 0.90504396, + "learning_rate": 0.0009023737580862095, + "loss": 0.91692042, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.18676758, + "step": 1176, + "time_per_iteration": 2.54901123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191354, + "balance_loss_mlp": 1.17276883, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0881916579324479, + "language_loss": 0.83226693, + "learning_rate": 0.0009021887431023321, + "loss": 0.84418046, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.18566895, + "step": 1177, + "time_per_iteration": 2.6121795177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174332, + "balance_loss_mlp": 1.15594959, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.08194623484888001, + "language_loss": 0.87241113, + "learning_rate": 0.0009020035719749369, + "loss": 0.88415444, + 
"num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.18359375, + "step": 1178, + "time_per_iteration": 2.7401885986328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158606, + "balance_loss_mlp": 1.14040256, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.0813633568079927, + "language_loss": 0.77680194, + "learning_rate": 0.0009018182447759136, + "loss": 0.78838801, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.18212891, + "step": 1179, + "time_per_iteration": 3.0078771114349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145135, + "balance_loss_mlp": 1.12688398, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.09172856476896407, + "language_loss": 0.79547179, + "learning_rate": 0.0009016327615772126, + "loss": 0.80692315, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.18249512, + "step": 1180, + "time_per_iteration": 2.956892251968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140365, + "balance_loss_mlp": 1.12199533, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.0875125644607483, + "language_loss": 0.87631428, + "learning_rate": 0.0009014471224508451, + "loss": 0.8877179, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.18359375, + "step": 1181, + "time_per_iteration": 2.6819214820861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140649, + "balance_loss_mlp": 1.12244546, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.12040521041324766, + "language_loss": 0.82781821, + "learning_rate": 0.0009012613274688823, + "loss": 0.8392247, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.18200684, + "step": 1182, + "time_per_iteration": 2.6545872688293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127803, + "balance_loss_mlp": 1.10971928, + "epoch": 0.22758753366679493, + "flos": 
440163942912.0, + "grad_norm": 0.11611648539449336, + "language_loss": 0.87670434, + "learning_rate": 0.0009010753767034565, + "loss": 0.88798231, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.1809082, + "step": 1183, + "time_per_iteration": 2.5755655765533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011456, + "balance_loss_mlp": 1.12726605, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.07779286107938752, + "language_loss": 0.78790247, + "learning_rate": 0.0009008892702267599, + "loss": 0.79935843, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.18347168, + "step": 1184, + "time_per_iteration": 2.9940855503082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145741, + "balance_loss_mlp": 1.12732279, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.09447672073297446, + "language_loss": 0.88500011, + "learning_rate": 0.0009007030081110457, + "loss": 0.89645755, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.18408203, + "step": 1185, + "time_per_iteration": 2.6603288650512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143533, + "balance_loss_mlp": 1.12500811, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.0853307601225198, + "language_loss": 0.84380877, + "learning_rate": 0.000900516590428627, + "loss": 0.85524416, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.18518066, + "step": 1186, + "time_per_iteration": 2.692070484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141181, + "balance_loss_mlp": 1.12318015, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.07243217971015652, + "language_loss": 0.89009422, + "learning_rate": 0.0009003300172518778, + "loss": 0.90150601, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.17980957, + "step": 1187, + "time_per_iteration": 2.7073988914489746 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137333, + "balance_loss_mlp": 1.11980963, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 0.08424899879196017, + "language_loss": 0.83985436, + "learning_rate": 0.0009001432886532321, + "loss": 0.85122764, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.17529297, + "step": 1188, + "time_per_iteration": 2.9843039512634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146183, + "balance_loss_mlp": 1.12812281, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.0771143581641096, + "language_loss": 0.8654418, + "learning_rate": 0.0008999564047051843, + "loss": 0.87690365, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.18054199, + "step": 1189, + "time_per_iteration": 2.6047263145446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152979, + "balance_loss_mlp": 1.13572931, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.0974051284777214, + "language_loss": 0.85100305, + "learning_rate": 0.0008997693654802894, + "loss": 0.86253285, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.17272949, + "step": 1190, + "time_per_iteration": 2.6849515438079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134691, + "balance_loss_mlp": 1.11709571, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.08474903758704144, + "language_loss": 0.86204302, + "learning_rate": 0.0008995821710511625, + "loss": 0.87338996, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.17602539, + "step": 1191, + "time_per_iteration": 2.742478132247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126818, + "balance_loss_mlp": 1.10922277, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.08571505564163927, + "language_loss": 0.84842807, + "learning_rate": 0.0008993948214904786, + "loss": 0.85969627, + 
"num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.17602539, + "step": 1192, + "time_per_iteration": 2.6361818313598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045247, + "balance_loss_mlp": 1.03237247, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.028329103864080232, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79467458, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.12890625, + "step": 1193, + "time_per_iteration": 4.969930171966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112876, + "balance_loss_mlp": 1.10972273, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.08612491826756107, + "language_loss": 0.78059292, + "learning_rate": 0.0008990196572654427, + "loss": 0.79188055, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.19018555, + "step": 1194, + "time_per_iteration": 2.8844966888427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140316, + "balance_loss_mlp": 1.12217188, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.10153558100200434, + "language_loss": 0.87920988, + "learning_rate": 0.0008988318427467426, + "loss": 0.89061302, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.18151855, + "step": 1195, + "time_per_iteration": 2.687624931335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142082, + "balance_loss_mlp": 1.12412882, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.08230259672194101, + "language_loss": 0.86206847, + "learning_rate": 0.0008986438733877887, + "loss": 0.87348932, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.17956543, + "step": 1196, + "time_per_iteration": 3.4957938194274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153338, + "balance_loss_mlp": 1.13559973, + "epoch": 0.2302808772604848, + "flos": 
683648418816.0, + "grad_norm": 0.06895925957333625, + "language_loss": 0.8397938, + "learning_rate": 0.0008984557492615576, + "loss": 0.85132712, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.1776123, + "step": 1197, + "time_per_iteration": 3.004096031188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148862, + "balance_loss_mlp": 1.13082576, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.07382939590065767, + "language_loss": 0.89479733, + "learning_rate": 0.0008982674704410854, + "loss": 0.906286, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.18029785, + "step": 1198, + "time_per_iteration": 2.6988983154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115166, + "balance_loss_mlp": 1.13448238, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.0949037059675448, + "language_loss": 0.77658606, + "learning_rate": 0.0008980790369994682, + "loss": 0.78810263, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.17199707, + "step": 1199, + "time_per_iteration": 2.9618003368377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154837, + "balance_loss_mlp": 1.13739705, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.07145246308543461, + "language_loss": 0.87144834, + "learning_rate": 0.000897890449009863, + "loss": 0.88299668, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.17443848, + "step": 1200, + "time_per_iteration": 2.7796213626861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116547, + "balance_loss_mlp": 1.14776802, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.09854596236312584, + "language_loss": 0.89783561, + "learning_rate": 0.0008977017065454853, + "loss": 0.90949035, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.17712402, + "step": 1201, + "time_per_iteration": 2.7383389472961426 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118456, + "balance_loss_mlp": 1.16748941, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 0.06681897447915772, + "language_loss": 0.79928529, + "learning_rate": 0.0008975128096796121, + "loss": 0.81113094, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.17077637, + "step": 1202, + "time_per_iteration": 2.893461227416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174856, + "balance_loss_mlp": 1.15766644, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.09321616984993739, + "language_loss": 0.85471004, + "learning_rate": 0.0008973237584855794, + "loss": 0.86645865, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.17211914, + "step": 1203, + "time_per_iteration": 2.898749589920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174851, + "balance_loss_mlp": 1.15761375, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.08459599639125864, + "language_loss": 0.82237399, + "learning_rate": 0.0008971345530367832, + "loss": 0.83412254, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.17248535, + "step": 1204, + "time_per_iteration": 2.5461792945861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169858, + "balance_loss_mlp": 1.15260816, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.08050630983240942, + "language_loss": 0.85032547, + "learning_rate": 0.0008969451934066799, + "loss": 0.86202407, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.17272949, + "step": 1205, + "time_per_iteration": 2.8455100059509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157737, + "balance_loss_mlp": 1.1401062, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.09118158600793376, + "language_loss": 0.79779387, + "learning_rate": 0.0008967556796687854, + "loss": 0.80937129, 
+ "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.1763916, + "step": 1206, + "time_per_iteration": 2.977187395095825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166868, + "balance_loss_mlp": 1.14940381, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.08470401629761377, + "language_loss": 0.83790028, + "learning_rate": 0.0008965660118966752, + "loss": 0.8495689, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.17480469, + "step": 1207, + "time_per_iteration": 2.9695510864257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164887, + "balance_loss_mlp": 1.14745879, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.07067711449707674, + "language_loss": 0.89920551, + "learning_rate": 0.0008963761901639851, + "loss": 0.9108544, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.17443848, + "step": 1208, + "time_per_iteration": 2.8432528972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164904, + "balance_loss_mlp": 1.14763093, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.07998084189671781, + "language_loss": 0.83062428, + "learning_rate": 0.0008961862145444103, + "loss": 0.84227335, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.17285156, + "step": 1209, + "time_per_iteration": 2.7639503479003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161441, + "balance_loss_mlp": 1.14392972, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.07404933879866919, + "language_loss": 0.85019284, + "learning_rate": 0.0008959960851117059, + "loss": 0.86180723, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.17541504, + "step": 1210, + "time_per_iteration": 2.639765739440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142071, + "balance_loss_mlp": 1.12463081, + "epoch": 0.23297422085417469, + "flos": 
511585403904.0, + "grad_norm": 0.06764705739880358, + "language_loss": 0.83661717, + "learning_rate": 0.0008958058019396868, + "loss": 0.8480379, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.17468262, + "step": 1211, + "time_per_iteration": 2.8551721572875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114749, + "balance_loss_mlp": 1.13016868, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.08875501668915448, + "language_loss": 0.86489981, + "learning_rate": 0.0008956153651022274, + "loss": 0.87637472, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.17333984, + "step": 1212, + "time_per_iteration": 2.7765469551086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144101, + "balance_loss_mlp": 1.12625563, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.07932001584083075, + "language_loss": 0.83832914, + "learning_rate": 0.0008954247746732618, + "loss": 0.84977019, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.17858887, + "step": 1213, + "time_per_iteration": 2.6084651947021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135394, + "balance_loss_mlp": 1.11788201, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.07442615591494516, + "language_loss": 0.90398782, + "learning_rate": 0.0008952340307267837, + "loss": 0.91534173, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.17529297, + "step": 1214, + "time_per_iteration": 2.89178466796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125335, + "balance_loss_mlp": 1.10793078, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.1453012637227399, + "language_loss": 0.8336947, + "learning_rate": 0.0008950431333368468, + "loss": 0.84494805, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.17419434, + "step": 1215, + "time_per_iteration": 2.5870306491851807 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111701, + "balance_loss_mlp": 1.09912825, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 0.07975417299664793, + "language_loss": 0.84537351, + "learning_rate": 0.0008948520825775634, + "loss": 0.8565436, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.17919922, + "step": 1216, + "time_per_iteration": 3.6591601371765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111106, + "balance_loss_mlp": 1.0930953, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.083699003973451, + "language_loss": 0.83777452, + "learning_rate": 0.0008946608785231067, + "loss": 0.84888518, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.1796875, + "step": 1217, + "time_per_iteration": 2.910045862197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122887, + "balance_loss_mlp": 1.10500622, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.07421571727754571, + "language_loss": 0.8465637, + "learning_rate": 0.0008944695212477084, + "loss": 0.85779262, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.17871094, + "step": 1218, + "time_per_iteration": 2.524942636489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136819, + "balance_loss_mlp": 1.11900902, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.08988714641466837, + "language_loss": 0.85843921, + "learning_rate": 0.0008942780108256599, + "loss": 0.86980736, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.17822266, + "step": 1219, + "time_per_iteration": 2.638685703277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122459, + "balance_loss_mlp": 1.10441041, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.09147837202786416, + "language_loss": 0.86524791, + "learning_rate": 0.0008940863473313121, + "loss": 0.87647247, 
+ "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.18054199, + "step": 1220, + "time_per_iteration": 2.5017247200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141789, + "balance_loss_mlp": 1.12406206, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.08221984397196716, + "language_loss": 0.87834692, + "learning_rate": 0.0008938945308390756, + "loss": 0.88976479, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.17724609, + "step": 1221, + "time_per_iteration": 2.663565158843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145313, + "balance_loss_mlp": 1.1284095, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.07596551545175816, + "language_loss": 0.86929715, + "learning_rate": 0.00089370256142342, + "loss": 0.88075024, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.16918945, + "step": 1222, + "time_per_iteration": 2.7675375938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143, + "balance_loss_mlp": 1.12577403, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.07111090095827391, + "language_loss": 0.84719163, + "learning_rate": 0.0008935104391588746, + "loss": 0.8586216, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.17248535, + "step": 1223, + "time_per_iteration": 2.7930641174316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141088, + "balance_loss_mlp": 1.12308729, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.09172313762061536, + "language_loss": 0.83210915, + "learning_rate": 0.0008933181641200276, + "loss": 0.84352005, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.18005371, + "step": 1224, + "time_per_iteration": 3.184723138809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113861, + "balance_loss_mlp": 1.1213243, + "epoch": 0.23566756444786457, + "flos": 
680164770816.0, + "grad_norm": 0.08544958772393396, + "language_loss": 0.85490656, + "learning_rate": 0.0008931257363815271, + "loss": 0.86629266, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.1730957, + "step": 1225, + "time_per_iteration": 2.9049925804138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116947, + "balance_loss_mlp": 1.09978044, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.08572157059192624, + "language_loss": 0.8983537, + "learning_rate": 0.0008929331560180798, + "loss": 0.90952325, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.171875, + "step": 1226, + "time_per_iteration": 2.976716995239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119433, + "balance_loss_mlp": 1.10198092, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.07629670414533757, + "language_loss": 0.90995669, + "learning_rate": 0.0008927404231044525, + "loss": 0.92115104, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.17468262, + "step": 1227, + "time_per_iteration": 2.754908561706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103828, + "balance_loss_mlp": 1.08611393, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.07882349010207228, + "language_loss": 0.81471217, + "learning_rate": 0.0008925475377154703, + "loss": 0.82575047, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.17736816, + "step": 1228, + "time_per_iteration": 2.7809646129608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100869, + "balance_loss_mlp": 1.08254623, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.07142925877548961, + "language_loss": 0.82040304, + "learning_rate": 0.0008923544999260183, + "loss": 0.83141172, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.18322754, + "step": 1229, + "time_per_iteration": 2.760239362716675 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110144, + "balance_loss_mlp": 1.09266782, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 0.12387153159230253, + "language_loss": 0.91337013, + "learning_rate": 0.00089216130981104, + "loss": 0.92447156, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.17480469, + "step": 1230, + "time_per_iteration": 3.121588945388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110904, + "balance_loss_mlp": 1.09090781, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.07661504881361146, + "language_loss": 0.82228827, + "learning_rate": 0.000891967967445539, + "loss": 0.83337867, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.18139648, + "step": 1231, + "time_per_iteration": 2.7672059535980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109921, + "balance_loss_mlp": 1.0920639, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.054732650189263314, + "language_loss": 0.88646662, + "learning_rate": 0.0008917744729045772, + "loss": 0.89756578, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.17871094, + "step": 1232, + "time_per_iteration": 2.9028637409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104934, + "balance_loss_mlp": 1.08743405, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.08391850168433768, + "language_loss": 0.83650339, + "learning_rate": 0.0008915808262632757, + "loss": 0.84755272, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.1751709, + "step": 1233, + "time_per_iteration": 2.870555877685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123449, + "balance_loss_mlp": 1.10509062, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.09539034143195195, + "language_loss": 0.92907977, + "learning_rate": 0.0008913870275968148, + "loss": 
0.94031429, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.18359375, + "step": 1234, + "time_per_iteration": 2.7251648902893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109776, + "balance_loss_mlp": 1.09154916, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.06697050939505883, + "language_loss": 0.87199342, + "learning_rate": 0.0008911930769804342, + "loss": 0.88309121, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.18237305, + "step": 1235, + "time_per_iteration": 3.268287420272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124889, + "balance_loss_mlp": 1.10593486, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.08058060241162714, + "language_loss": 0.91074061, + "learning_rate": 0.0008909989744894318, + "loss": 0.92198944, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.1895752, + "step": 1236, + "time_per_iteration": 2.8918802738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118206, + "balance_loss_mlp": 1.10007429, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.11301283658583765, + "language_loss": 0.81326294, + "learning_rate": 0.0008908047201991649, + "loss": 0.82444501, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.18127441, + "step": 1237, + "time_per_iteration": 2.8053224086761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111628, + "balance_loss_mlp": 1.09433031, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.0928222329851358, + "language_loss": 0.86241579, + "learning_rate": 0.0008906103141850502, + "loss": 0.87353206, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.17321777, + "step": 1238, + "time_per_iteration": 2.90500545501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117636, + "balance_loss_mlp": 1.09980249, + "epoch": 0.23836090804155444, 
+ "flos": 521431769088.0, + "grad_norm": 0.08449694721293455, + "language_loss": 0.87626004, + "learning_rate": 0.0008904157565225621, + "loss": 0.88743639, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.17834473, + "step": 1239, + "time_per_iteration": 2.687969923019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126339, + "balance_loss_mlp": 1.10839748, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.08713278777958322, + "language_loss": 0.815947, + "learning_rate": 0.000890221047287235, + "loss": 0.82721043, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.17944336, + "step": 1240, + "time_per_iteration": 3.531710386276245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134139, + "balance_loss_mlp": 1.11636496, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.07670600064189544, + "language_loss": 0.90527886, + "learning_rate": 0.0008900261865546615, + "loss": 0.91662019, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.17797852, + "step": 1241, + "time_per_iteration": 2.6662704944610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152535, + "balance_loss_mlp": 1.13414097, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.12487758336027797, + "language_loss": 0.84415132, + "learning_rate": 0.0008898311744004936, + "loss": 0.85567665, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.18408203, + "step": 1242, + "time_per_iteration": 2.763388156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149998, + "balance_loss_mlp": 1.13165212, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.06740377455140158, + "language_loss": 0.86921692, + "learning_rate": 0.0008896360109004414, + "loss": 0.88071686, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.18359375, + "step": 1243, + "time_per_iteration": 
2.6441633701324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140632, + "balance_loss_mlp": 1.12121248, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 0.09575659644731266, + "language_loss": 0.84275168, + "learning_rate": 0.0008894406961302742, + "loss": 0.85415804, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.1940918, + "step": 1244, + "time_per_iteration": 2.6425938606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112564, + "balance_loss_mlp": 1.10582733, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.07353599262773654, + "language_loss": 0.83287829, + "learning_rate": 0.0008892452301658201, + "loss": 0.84413469, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.19799805, + "step": 1245, + "time_per_iteration": 2.9552412033081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105353, + "balance_loss_mlp": 1.08604133, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.06971047839699994, + "language_loss": 0.83254242, + "learning_rate": 0.0008890496130829653, + "loss": 0.84359598, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.19287109, + "step": 1246, + "time_per_iteration": 2.714538812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094484, + "balance_loss_mlp": 1.07490993, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.07160128232814054, + "language_loss": 0.85448045, + "learning_rate": 0.0008888538449576555, + "loss": 0.86542535, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.19567871, + "step": 1247, + "time_per_iteration": 2.5854134559631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081471, + "balance_loss_mlp": 1.06212282, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.10364601092251456, + "language_loss": 0.82938588, + "learning_rate": 
0.0008886579258658944, + "loss": 0.84020054, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.1932373, + "step": 1248, + "time_per_iteration": 2.56381893157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085911, + "balance_loss_mlp": 1.06643224, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.11636637674492897, + "language_loss": 0.84617007, + "learning_rate": 0.0008884618558837446, + "loss": 0.8570292, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.19470215, + "step": 1249, + "time_per_iteration": 2.8670427799224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092713, + "balance_loss_mlp": 1.07287669, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.09934462101700196, + "language_loss": 0.86105502, + "learning_rate": 0.0008882656350873273, + "loss": 0.87198216, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.19836426, + "step": 1250, + "time_per_iteration": 2.9198391437530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095988, + "balance_loss_mlp": 1.07702184, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.10386778667644601, + "language_loss": 0.86847913, + "learning_rate": 0.0008880692635528219, + "loss": 0.879439, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.1895752, + "step": 1251, + "time_per_iteration": 3.114600658416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108272, + "balance_loss_mlp": 1.08975875, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.09512533379834028, + "language_loss": 0.89605117, + "learning_rate": 0.0008878727413564669, + "loss": 0.90713388, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.18518066, + "step": 1252, + "time_per_iteration": 2.7784321308135986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044903, + "balance_loss_mlp": 1.0333159, + 
"epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.02598255704274824, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81180501, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.11572266, + "step": 1253, + "time_per_iteration": 4.945368528366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142164, + "balance_loss_mlp": 1.12338829, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.08359922246859781, + "language_loss": 0.78146553, + "learning_rate": 0.0008874792452834528, + "loss": 0.79288721, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.18774414, + "step": 1254, + "time_per_iteration": 2.765700340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144466, + "balance_loss_mlp": 1.12684703, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.08184252001830684, + "language_loss": 0.87274945, + "learning_rate": 0.0008872822715595626, + "loss": 0.88419414, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.17626953, + "step": 1255, + "time_per_iteration": 2.687319040298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141993, + "balance_loss_mlp": 1.12460077, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.10883062221863066, + "language_loss": 0.86691022, + "learning_rate": 0.0008870851474793598, + "loss": 0.87833017, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.17419434, + "step": 1256, + "time_per_iteration": 2.6231887340545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136562, + "balance_loss_mlp": 1.11930037, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.08915320009922777, + "language_loss": 0.89053321, + "learning_rate": 0.0008868878731193752, + "loss": 0.90189886, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.17285156, + "step": 1257, + 
"time_per_iteration": 2.928931713104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113174, + "balance_loss_mlp": 1.11484766, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 0.08262742442990392, + "language_loss": 0.89427495, + "learning_rate": 0.0008866904485561973, + "loss": 0.90559232, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.16906738, + "step": 1258, + "time_per_iteration": 2.7494447231292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136898, + "balance_loss_mlp": 1.11986327, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.08559449998713918, + "language_loss": 0.82794583, + "learning_rate": 0.000886492873866473, + "loss": 0.83931482, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.17053223, + "step": 1259, + "time_per_iteration": 2.841770648956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112569, + "balance_loss_mlp": 1.10853612, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.12665734927529698, + "language_loss": 0.8437835, + "learning_rate": 0.000886295149126908, + "loss": 0.85504043, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.17163086, + "step": 1260, + "time_per_iteration": 2.7847495079040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119489, + "balance_loss_mlp": 1.10270476, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.13276121908066757, + "language_loss": 0.85482794, + "learning_rate": 0.0008860972744142655, + "loss": 0.86602283, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.16796875, + "step": 1261, + "time_per_iteration": 2.9415853023529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117567, + "balance_loss_mlp": 1.10078192, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.09469206100439348, + "language_loss": 0.81489432, + 
"learning_rate": 0.0008858992498053671, + "loss": 0.82606995, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.16796875, + "step": 1262, + "time_per_iteration": 2.8460397720336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058087, + "balance_loss_mlp": 1.04578424, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.030096600393216412, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.7764684, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.12304688, + "step": 1263, + "time_per_iteration": 4.891434192657471 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164356, + "balance_loss_mlp": 1.14685583, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.07687362244804527, + "language_loss": 0.83471984, + "learning_rate": 0.0008855027512063817, + "loss": 0.84636343, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.17504883, + "step": 1264, + "time_per_iteration": 2.729905843734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188155, + "balance_loss_mlp": 1.17034483, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.10565566639423048, + "language_loss": 0.85338992, + "learning_rate": 0.0008853042773702292, + "loss": 0.86527145, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.17810059, + "step": 1265, + "time_per_iteration": 2.7027270793914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213519, + "balance_loss_mlp": 1.19497013, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.10310511352597752, + "language_loss": 0.87869942, + "learning_rate": 0.0008851056539456896, + "loss": 0.89083463, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.1854248, + "step": 1266, + "time_per_iteration": 2.7062103748321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190822, + 
"balance_loss_mlp": 1.17235637, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.106198185782814, + "language_loss": 0.81649381, + "learning_rate": 0.0008849068810098755, + "loss": 0.82840204, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.18469238, + "step": 1267, + "time_per_iteration": 3.329357862472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169809, + "balance_loss_mlp": 1.15086627, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.11133940138273103, + "language_loss": 0.82717752, + "learning_rate": 0.0008847079586399575, + "loss": 0.83887565, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.18945312, + "step": 1268, + "time_per_iteration": 2.558319091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131294, + "balance_loss_mlp": 1.11318588, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.08817279245044941, + "language_loss": 0.85679001, + "learning_rate": 0.0008845088869131641, + "loss": 0.86810291, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.18103027, + "step": 1269, + "time_per_iteration": 2.692885637283325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122646, + "balance_loss_mlp": 1.10412109, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.07664646159291034, + "language_loss": 0.88602984, + "learning_rate": 0.0008843096659067818, + "loss": 0.89725631, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.18505859, + "step": 1270, + "time_per_iteration": 2.688197374343872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117697, + "balance_loss_mlp": 1.09989929, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.06543357243765746, + "language_loss": 0.86065173, + "learning_rate": 0.000884110295698155, + "loss": 0.87182868, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 
0.17822266, + "step": 1271, + "time_per_iteration": 2.9497103691101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113614, + "balance_loss_mlp": 1.09520805, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + "grad_norm": 0.10345235518870362, + "language_loss": 0.85674417, + "learning_rate": 0.0008839107763646861, + "loss": 0.86788034, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.18395996, + "step": 1272, + "time_per_iteration": 2.6293063163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111354, + "balance_loss_mlp": 1.09307909, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.0866440520117465, + "language_loss": 0.90339661, + "learning_rate": 0.0008837111079838353, + "loss": 0.91451013, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.18273926, + "step": 1273, + "time_per_iteration": 2.7676210403442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112497, + "balance_loss_mlp": 1.10732698, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.08933257913148762, + "language_loss": 0.89889824, + "learning_rate": 0.000883511290633121, + "loss": 0.91014791, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.17651367, + "step": 1274, + "time_per_iteration": 2.5634043216705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111162, + "balance_loss_mlp": 1.09361923, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.08498045219099847, + "language_loss": 0.92045552, + "learning_rate": 0.000883311324390119, + "loss": 0.93157172, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.18005371, + "step": 1275, + "time_per_iteration": 2.688175678253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117687, + "balance_loss_mlp": 1.09850657, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.093400697768974, + 
"language_loss": 0.81587857, + "learning_rate": 0.0008831112093324629, + "loss": 0.82705545, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.19177246, + "step": 1276, + "time_per_iteration": 3.0782830715179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120052, + "balance_loss_mlp": 1.10156226, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.07571489376575821, + "language_loss": 0.88611054, + "learning_rate": 0.0008829109455378444, + "loss": 0.89731109, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.18481445, + "step": 1277, + "time_per_iteration": 2.7325568199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130651, + "balance_loss_mlp": 1.11251891, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.08746979241051268, + "language_loss": 0.86345637, + "learning_rate": 0.000882710533084013, + "loss": 0.87476289, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.18139648, + "step": 1278, + "time_per_iteration": 2.647641658782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113502, + "balance_loss_mlp": 1.11687636, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.0699906863373026, + "language_loss": 0.89239269, + "learning_rate": 0.0008825099720487755, + "loss": 0.90374291, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.18164062, + "step": 1279, + "time_per_iteration": 2.647472858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108592, + "balance_loss_mlp": 1.07490551, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.04364177649541596, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76347059, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.11035156, + "step": 1280, + "time_per_iteration": 4.876530647277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01056171, + "balance_loss_mlp": 1.04515576, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.029948837084711404, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.79000282, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.11035156, + "step": 1281, + "time_per_iteration": 4.817251205444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130582, + "balance_loss_mlp": 1.11283183, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.0778912228408071, + "language_loss": 0.89449739, + "learning_rate": 0.0008819073982335619, + "loss": 0.9058032, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.17773438, + "step": 1282, + "time_per_iteration": 2.849764823913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139737, + "balance_loss_mlp": 1.12209415, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.06136900444292705, + "language_loss": 0.84456879, + "learning_rate": 0.0008817062436519235, + "loss": 0.85596615, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.17651367, + "step": 1283, + "time_per_iteration": 2.662811040878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126818, + "balance_loss_mlp": 1.10860264, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.11946768082571088, + "language_loss": 0.895989, + "learning_rate": 0.0008815049408787788, + "loss": 0.90725714, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.18212891, + "step": 1284, + "time_per_iteration": 2.5498671531677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118472, + "balance_loss_mlp": 1.10030437, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.07911934764568136, + "language_loss": 0.85533321, + "learning_rate": 0.0008813034899922805, + "loss": 0.86651796, + "num_input_tokens_seen": 106638368, + 
"router_z_loss_mlp": 0.1817627, + "step": 1285, + "time_per_iteration": 2.546613931655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112528, + "balance_loss_mlp": 1.10687399, + "epoch": 0.2474028472489419, + "flos": 504427585536.0, + "grad_norm": 0.09325179905503529, + "language_loss": 0.89224762, + "learning_rate": 0.0008811018910706387, + "loss": 0.90350044, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.18395996, + "step": 1286, + "time_per_iteration": 2.5715928077697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124504, + "balance_loss_mlp": 1.10582423, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.08651255320330896, + "language_loss": 0.81603038, + "learning_rate": 0.0008809001441921211, + "loss": 0.82727551, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.18688965, + "step": 1287, + "time_per_iteration": 2.76352858543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116455, + "balance_loss_mlp": 1.09800124, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.07934964537800443, + "language_loss": 0.85291266, + "learning_rate": 0.0008806982494350528, + "loss": 0.86407721, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.18457031, + "step": 1288, + "time_per_iteration": 2.6464178562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125382, + "balance_loss_mlp": 1.10674942, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.07889330448691204, + "language_loss": 0.89930373, + "learning_rate": 0.0008804962068778161, + "loss": 0.91055757, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.18615723, + "step": 1289, + "time_per_iteration": 2.8725006580352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123355, + "balance_loss_mlp": 1.10481799, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 
0.09114492679937135, + "language_loss": 0.80640042, + "learning_rate": 0.0008802940165988511, + "loss": 0.81763393, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.18530273, + "step": 1290, + "time_per_iteration": 2.9053151607513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113226, + "balance_loss_mlp": 1.11324596, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.07850606096458997, + "language_loss": 0.88298845, + "learning_rate": 0.000880091678676655, + "loss": 0.89431107, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.18981934, + "step": 1291, + "time_per_iteration": 2.8338379859924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115571, + "balance_loss_mlp": 1.09697485, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.0792961220184265, + "language_loss": 0.89043152, + "learning_rate": 0.0008798891931897821, + "loss": 0.90158725, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.18579102, + "step": 1292, + "time_per_iteration": 2.7769196033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121528, + "balance_loss_mlp": 1.10277641, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.0746346978796093, + "language_loss": 0.84222198, + "learning_rate": 0.0008796865602168447, + "loss": 0.8534373, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.18737793, + "step": 1293, + "time_per_iteration": 2.560858964920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115627, + "balance_loss_mlp": 1.09803176, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.06740604853273545, + "language_loss": 0.88270545, + "learning_rate": 0.0008794837798365115, + "loss": 0.89386165, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.17614746, + "step": 1294, + "time_per_iteration": 2.6477129459381104 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01125631, + "balance_loss_mlp": 1.10763049, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.0873145111764115, + "language_loss": 0.88408256, + "learning_rate": 0.0008792808521275089, + "loss": 0.89533883, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.18017578, + "step": 1295, + "time_per_iteration": 2.7224135398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121076, + "balance_loss_mlp": 1.10262191, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.0692696283298791, + "language_loss": 0.87340117, + "learning_rate": 0.0008790777771686206, + "loss": 0.88461185, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.18444824, + "step": 1296, + "time_per_iteration": 2.61126446723938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113013, + "balance_loss_mlp": 1.09509635, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.07573373752967896, + "language_loss": 0.84983516, + "learning_rate": 0.0008788745550386872, + "loss": 0.86096525, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.17932129, + "step": 1297, + "time_per_iteration": 2.573880672454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117051, + "balance_loss_mlp": 1.09876418, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.10171762649266601, + "language_loss": 0.797032, + "learning_rate": 0.0008786711858166063, + "loss": 0.80820251, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.18286133, + "step": 1298, + "time_per_iteration": 2.9712767601013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123606, + "balance_loss_mlp": 1.10497391, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.0822091876784568, + "language_loss": 0.83161783, + "learning_rate": 0.0008784676695813332, + "loss": 0.8428539, + "num_input_tokens_seen": 
107660384, + "router_z_loss_mlp": 0.1862793, + "step": 1299, + "time_per_iteration": 2.966691017150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129878, + "balance_loss_mlp": 1.11144853, + "epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.08080427389944742, + "language_loss": 0.84450245, + "learning_rate": 0.0008782640064118796, + "loss": 0.85580122, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.18408203, + "step": 1300, + "time_per_iteration": 2.92551589012146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240263, + "balance_loss_mlp": 1.22471797, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.06645546985774646, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77425015, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.15527344, + "step": 1301, + "time_per_iteration": 4.9493842124938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114228, + "balance_loss_mlp": 1.12376654, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.09006790660725612, + "language_loss": 0.8623417, + "learning_rate": 0.0008778562395867648, + "loss": 0.87376451, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.18518066, + "step": 1302, + "time_per_iteration": 2.635500907897949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122782, + "balance_loss_mlp": 1.10403061, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.07479626477523657, + "language_loss": 0.83630598, + "learning_rate": 0.0008776521360894127, + "loss": 0.84753382, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.1875, + "step": 1303, + "time_per_iteration": 2.640951156616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090617, + "balance_loss_mlp": 1.07707512, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 
0.0418328343897397, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80052686, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.13574219, + "step": 1304, + "time_per_iteration": 4.842891454696655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104198, + "balance_loss_mlp": 1.08618569, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.0798377990367126, + "language_loss": 0.90237606, + "learning_rate": 0.0008772434893213186, + "loss": 0.91341805, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.18017578, + "step": 1305, + "time_per_iteration": 2.6264374256134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097469, + "balance_loss_mlp": 1.07925391, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.07815304176143087, + "language_loss": 0.84344316, + "learning_rate": 0.0008770389462092276, + "loss": 0.85441786, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.18225098, + "step": 1306, + "time_per_iteration": 2.6599185466766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093714, + "balance_loss_mlp": 1.07480729, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.08248282915226902, + "language_loss": 0.86642498, + "learning_rate": 0.0008768342567176357, + "loss": 0.87736213, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.18908691, + "step": 1307, + "time_per_iteration": 2.919123411178589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094095, + "balance_loss_mlp": 1.07524765, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.07892434793160769, + "language_loss": 0.90316761, + "learning_rate": 0.0008766294209260107, + "loss": 0.91410857, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.18859863, + "step": 1308, + "time_per_iteration": 2.703994035720825 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01093703, + "balance_loss_mlp": 1.07496333, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.09325948106778781, + "language_loss": 0.9126637, + "learning_rate": 0.0008764244389138767, + "loss": 0.92360079, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.18725586, + "step": 1309, + "time_per_iteration": 2.6175687313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092261, + "balance_loss_mlp": 1.07365251, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.10626806402083949, + "language_loss": 0.81772095, + "learning_rate": 0.000876219310760815, + "loss": 0.82864356, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.18603516, + "step": 1310, + "time_per_iteration": 2.8659133911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097983, + "balance_loss_mlp": 1.07988715, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.13076548306856256, + "language_loss": 0.81004, + "learning_rate": 0.0008760140365464631, + "loss": 0.82101983, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.18103027, + "step": 1311, + "time_per_iteration": 2.646810531616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120372, + "balance_loss_mlp": 1.10276532, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.11580551837549759, + "language_loss": 0.87203217, + "learning_rate": 0.0008758086163505156, + "loss": 0.88323587, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.17626953, + "step": 1312, + "time_per_iteration": 2.601256847381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135664, + "balance_loss_mlp": 1.11779475, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.0666103465279768, + "language_loss": 0.89063561, + "learning_rate": 0.0008756030502527239, + "loss": 0.90199232, + "num_input_tokens_seen": 
108986336, + "router_z_loss_mlp": 0.17883301, + "step": 1313, + "time_per_iteration": 2.8330187797546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161369, + "balance_loss_mlp": 1.14360678, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.0708022330446315, + "language_loss": 0.90153992, + "learning_rate": 0.0008753973383328954, + "loss": 0.91315365, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.17785645, + "step": 1314, + "time_per_iteration": 2.685375928878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011545, + "balance_loss_mlp": 1.13647509, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.08974334028560671, + "language_loss": 0.83722651, + "learning_rate": 0.0008751914806708952, + "loss": 0.84877157, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.18029785, + "step": 1315, + "time_per_iteration": 2.6155343055725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164677, + "balance_loss_mlp": 1.14708161, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.08978858583773926, + "language_loss": 0.81837153, + "learning_rate": 0.0008749854773466439, + "loss": 0.83001828, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.17614746, + "step": 1316, + "time_per_iteration": 2.7219769954681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163056, + "balance_loss_mlp": 1.14553261, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.07528804981442601, + "language_loss": 0.8451466, + "learning_rate": 0.0008747793284401192, + "loss": 0.85677719, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.17541504, + "step": 1317, + "time_per_iteration": 2.7144973278045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151322, + "balance_loss_mlp": 1.13359582, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 
0.08898497659473818, + "language_loss": 0.85280555, + "learning_rate": 0.0008745730340313551, + "loss": 0.86431873, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.17736816, + "step": 1318, + "time_per_iteration": 2.7930002212524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.13595057, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.08370435102239727, + "language_loss": 0.84217906, + "learning_rate": 0.0008743665942004422, + "loss": 0.85371482, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.1763916, + "step": 1319, + "time_per_iteration": 2.68245530128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160638, + "balance_loss_mlp": 1.14311421, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.07392804364708638, + "language_loss": 0.92852235, + "learning_rate": 0.0008741600090275277, + "loss": 0.9401288, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.17529297, + "step": 1320, + "time_per_iteration": 2.5977306365966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163682, + "balance_loss_mlp": 1.14569294, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.10450079995548846, + "language_loss": 0.8392204, + "learning_rate": 0.0008739532785928151, + "loss": 0.8508572, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.17993164, + "step": 1321, + "time_per_iteration": 3.464723587036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181344, + "balance_loss_mlp": 1.16827822, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.05258117628035473, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.76074928, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.13085938, + "step": 1322, + "time_per_iteration": 4.845709562301636 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01194039, + "balance_loss_mlp": 1.17626476, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.080849834949414, + "language_loss": 0.83025825, + "learning_rate": 0.0008735393822590908, + "loss": 0.84219867, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.17785645, + "step": 1323, + "time_per_iteration": 2.7540626525878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204948, + "balance_loss_mlp": 1.18740082, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.08178952973842966, + "language_loss": 0.86670357, + "learning_rate": 0.0008733322165207681, + "loss": 0.87875307, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.17578125, + "step": 1324, + "time_per_iteration": 2.6596570014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203339, + "balance_loss_mlp": 1.18555284, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.08051520692213045, + "language_loss": 0.82727516, + "learning_rate": 0.0008731249058420247, + "loss": 0.8393085, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.17810059, + "step": 1325, + "time_per_iteration": 3.082704782485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197065, + "balance_loss_mlp": 1.17887366, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.07988786822753648, + "language_loss": 0.90256196, + "learning_rate": 0.0008729174503033459, + "loss": 0.9145326, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.18188477, + "step": 1326, + "time_per_iteration": 2.663212299346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163002, + "balance_loss_mlp": 1.14462042, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.09140325585124401, + "language_loss": 0.82217562, + "learning_rate": 0.0008727098499852728, + "loss": 0.83380556, + "num_input_tokens_seen": 
110190160, + "router_z_loss_mlp": 0.18383789, + "step": 1327, + "time_per_iteration": 2.859302520751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114294, + "balance_loss_mlp": 1.12451005, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.07316654776483361, + "language_loss": 0.89623642, + "learning_rate": 0.0008725021049684034, + "loss": 0.90766573, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.18432617, + "step": 1328, + "time_per_iteration": 2.7523410320281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117051, + "balance_loss_mlp": 1.09832358, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.06969820691150284, + "language_loss": 0.82930326, + "learning_rate": 0.000872294215333391, + "loss": 0.84047389, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.18713379, + "step": 1329, + "time_per_iteration": 3.243213415145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.08953917, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.08533282388950945, + "language_loss": 0.82889348, + "learning_rate": 0.0008720861811609457, + "loss": 0.83997935, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.19042969, + "step": 1330, + "time_per_iteration": 2.789504051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086894, + "balance_loss_mlp": 1.06807089, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.08137535215054885, + "language_loss": 0.83645493, + "learning_rate": 0.0008718780025318338, + "loss": 0.84732389, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.18823242, + "step": 1331, + "time_per_iteration": 2.7668251991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092113, + "balance_loss_mlp": 1.07411242, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 
0.08447566633159821, + "language_loss": 0.83860987, + "learning_rate": 0.0008716696795268771, + "loss": 0.84953099, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.18017578, + "step": 1332, + "time_per_iteration": 2.71281099319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088022, + "balance_loss_mlp": 1.06994987, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.08355917909814405, + "language_loss": 0.85442013, + "learning_rate": 0.0008714612122269538, + "loss": 0.8653003, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.1809082, + "step": 1333, + "time_per_iteration": 2.9077794551849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108256, + "balance_loss_mlp": 1.09015965, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.09490231540823739, + "language_loss": 0.89133245, + "learning_rate": 0.0008712526007129982, + "loss": 0.90241498, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.18103027, + "step": 1334, + "time_per_iteration": 2.5269079208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127264, + "balance_loss_mlp": 1.10958493, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.09530184614586146, + "language_loss": 0.90164447, + "learning_rate": 0.0008710438450660003, + "loss": 0.91291702, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.17687988, + "step": 1335, + "time_per_iteration": 2.690424680709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127744, + "balance_loss_mlp": 1.10994577, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.09938976745138839, + "language_loss": 0.87409496, + "learning_rate": 0.0008708349453670064, + "loss": 0.88537246, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.17810059, + "step": 1336, + "time_per_iteration": 2.5319509506225586 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01128077, + "balance_loss_mlp": 1.10982585, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.08461134195014028, + "language_loss": 0.91159999, + "learning_rate": 0.0008706259016971185, + "loss": 0.92288077, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.18249512, + "step": 1337, + "time_per_iteration": 2.8355276584625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133843, + "balance_loss_mlp": 1.11533022, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.1004001057114973, + "language_loss": 0.82634485, + "learning_rate": 0.0008704167141374944, + "loss": 0.83768326, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.18518066, + "step": 1338, + "time_per_iteration": 2.83562970161438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125326, + "balance_loss_mlp": 1.10650253, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.07535387519287148, + "language_loss": 0.87972409, + "learning_rate": 0.0008702073827693482, + "loss": 0.89097726, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.18823242, + "step": 1339, + "time_per_iteration": 2.7440268993377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121252, + "balance_loss_mlp": 1.10240531, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.07907705856450171, + "language_loss": 0.8856355, + "learning_rate": 0.0008699979076739494, + "loss": 0.89684802, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.18847656, + "step": 1340, + "time_per_iteration": 2.985356092453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132949, + "balance_loss_mlp": 1.11369705, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.10358510275764175, + "language_loss": 0.88529009, + "learning_rate": 0.0008697882889326234, + "loss": 0.89661956, + 
"num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.19238281, + "step": 1341, + "time_per_iteration": 2.564622163772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136453, + "balance_loss_mlp": 1.11695075, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.09783747399550236, + "language_loss": 0.8651613, + "learning_rate": 0.0008695785266267515, + "loss": 0.87652576, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.19482422, + "step": 1342, + "time_per_iteration": 2.7061781883239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147948, + "balance_loss_mlp": 1.12840939, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.08416519118542358, + "language_loss": 0.83111393, + "learning_rate": 0.0008693686208377704, + "loss": 0.84259331, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.19543457, + "step": 1343, + "time_per_iteration": 2.8751444816589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150711, + "balance_loss_mlp": 1.13156581, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.07899493252865974, + "language_loss": 0.88980556, + "learning_rate": 0.0008691585716471733, + "loss": 0.90131271, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.19140625, + "step": 1344, + "time_per_iteration": 2.6969785690307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159409, + "balance_loss_mlp": 1.14027607, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.06941419908962602, + "language_loss": 0.8544178, + "learning_rate": 0.0008689483791365079, + "loss": 0.86601192, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.19116211, + "step": 1345, + "time_per_iteration": 2.8562369346618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154974, + "balance_loss_mlp": 1.13669968, + "epoch": 0.2589457483647557, + "flos": 
576849724416.0, + "grad_norm": 0.07286553563097259, + "language_loss": 0.89186096, + "learning_rate": 0.0008687380433873786, + "loss": 0.90341073, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.18273926, + "step": 1346, + "time_per_iteration": 2.7854301929473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173599, + "balance_loss_mlp": 1.15573001, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.11357363401175323, + "language_loss": 0.82125735, + "learning_rate": 0.0008685275644814448, + "loss": 0.83299333, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.17883301, + "step": 1347, + "time_per_iteration": 2.6921608448028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116629, + "balance_loss_mlp": 1.14855206, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07639398633752482, + "language_loss": 0.8419714, + "learning_rate": 0.0008683169425004216, + "loss": 0.85363436, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.17773438, + "step": 1348, + "time_per_iteration": 2.9085500240325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153795, + "balance_loss_mlp": 1.13597322, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.09519621553180321, + "language_loss": 0.8328886, + "learning_rate": 0.0008681061775260799, + "loss": 0.84442651, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.17834473, + "step": 1349, + "time_per_iteration": 2.8755290508270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143034, + "balance_loss_mlp": 1.12578487, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.10298645875309809, + "language_loss": 0.92206728, + "learning_rate": 0.0008678952696402458, + "loss": 0.93349767, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.17260742, + "step": 1350, + "time_per_iteration": 2.530040740966797 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128339, + "balance_loss_mlp": 1.11113763, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 0.07054972097096389, + "language_loss": 0.85973078, + "learning_rate": 0.000867684218924801, + "loss": 0.87101424, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.17211914, + "step": 1351, + "time_per_iteration": 2.924776077270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135752, + "balance_loss_mlp": 1.12478447, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.07057525744027235, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80082846, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.10986328, + "step": 1352, + "time_per_iteration": 4.937533378601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127686, + "balance_loss_mlp": 1.11084199, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.06384913215279323, + "language_loss": 0.85261834, + "learning_rate": 0.0008672616893328834, + "loss": 0.86389524, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.1685791, + "step": 1353, + "time_per_iteration": 2.9442062377929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122527, + "balance_loss_mlp": 1.10589719, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.09199225792086613, + "language_loss": 0.90041292, + "learning_rate": 0.0008670502106204512, + "loss": 0.91163814, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.16638184, + "step": 1354, + "time_per_iteration": 2.840792417526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132378, + "balance_loss_mlp": 1.11488962, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.0749682309300763, + "language_loss": 0.81919277, + "learning_rate": 0.0008668385894064892, + "loss": 
0.83051658, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.1751709, + "step": 1355, + "time_per_iteration": 2.649226665496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150444, + "balance_loss_mlp": 1.13379025, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.10108237113866697, + "language_loss": 0.89089942, + "learning_rate": 0.0008666268257731562, + "loss": 0.90240383, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.16662598, + "step": 1356, + "time_per_iteration": 3.1606926918029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152316, + "balance_loss_mlp": 1.13520908, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.09285423546908722, + "language_loss": 0.85545158, + "learning_rate": 0.0008664149198026662, + "loss": 0.86697471, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.17126465, + "step": 1357, + "time_per_iteration": 3.286130428314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164462, + "balance_loss_mlp": 1.14699829, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.08517439685870379, + "language_loss": 0.88857412, + "learning_rate": 0.0008662028715772883, + "loss": 0.90021884, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.17480469, + "step": 1358, + "time_per_iteration": 2.6877803802490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157352, + "balance_loss_mlp": 1.13951862, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.08437519054308197, + "language_loss": 0.85356647, + "learning_rate": 0.0008659906811793467, + "loss": 0.86514002, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.1784668, + "step": 1359, + "time_per_iteration": 2.701963186264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152325, + "balance_loss_mlp": 1.13483691, + "epoch": 0.26163909195844554, + 
"flos": 583259056128.0, + "grad_norm": 0.09516463994255123, + "language_loss": 0.89262813, + "learning_rate": 0.0008657783486912215, + "loss": 0.90415138, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.17504883, + "step": 1360, + "time_per_iteration": 2.7410097122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150671, + "balance_loss_mlp": 1.1330992, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.06828467212359378, + "language_loss": 0.8976928, + "learning_rate": 0.0008655658741953472, + "loss": 0.90919948, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.17590332, + "step": 1361, + "time_per_iteration": 3.2329330444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138416, + "balance_loss_mlp": 1.12074876, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.06454511059104741, + "language_loss": 0.88249099, + "learning_rate": 0.0008653532577742136, + "loss": 0.89387512, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.17675781, + "step": 1362, + "time_per_iteration": 2.746363401412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139921, + "balance_loss_mlp": 1.12302947, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.07711827630070714, + "language_loss": 0.86794758, + "learning_rate": 0.0008651404995103659, + "loss": 0.87934673, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.16906738, + "step": 1363, + "time_per_iteration": 2.5565500259399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132814, + "balance_loss_mlp": 1.11538577, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.08155880386034024, + "language_loss": 0.8709327, + "learning_rate": 0.0008649275994864041, + "loss": 0.8822608, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.17431641, + "step": 1364, + "time_per_iteration": 2.716562032699585 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133153, + "balance_loss_mlp": 1.11586761, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 0.06672959076804742, + "language_loss": 0.83875144, + "learning_rate": 0.0008647145577849834, + "loss": 0.85008299, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.1730957, + "step": 1365, + "time_per_iteration": 2.8476812839508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129924, + "balance_loss_mlp": 1.11255515, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.0668808093236692, + "language_loss": 0.82936931, + "learning_rate": 0.0008645013744888139, + "loss": 0.8406685, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.17382812, + "step": 1366, + "time_per_iteration": 2.891817092895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127692, + "balance_loss_mlp": 1.11063313, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.08778385712395331, + "language_loss": 0.87274009, + "learning_rate": 0.0008642880496806607, + "loss": 0.88401705, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.17077637, + "step": 1367, + "time_per_iteration": 2.8053958415985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120237, + "balance_loss_mlp": 1.10274851, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.07681621031760291, + "language_loss": 0.84336966, + "learning_rate": 0.0008640745834433437, + "loss": 0.85457206, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.17504883, + "step": 1368, + "time_per_iteration": 2.787339925765991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121438, + "balance_loss_mlp": 1.10430789, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.09521927305918056, + "language_loss": 0.86539549, + "learning_rate": 0.000863860975859738, + "loss": 
0.87660992, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.17126465, + "step": 1369, + "time_per_iteration": 2.9646191596984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114699, + "balance_loss_mlp": 1.0977838, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.08138719928792186, + "language_loss": 0.87995172, + "learning_rate": 0.0008636472270127733, + "loss": 0.89109874, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.16918945, + "step": 1370, + "time_per_iteration": 2.646869421005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110661, + "balance_loss_mlp": 1.08878803, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.09119402348134849, + "language_loss": 0.90394557, + "learning_rate": 0.0008634333369854345, + "loss": 0.91501164, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.1784668, + "step": 1371, + "time_per_iteration": 2.630207061767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101355, + "balance_loss_mlp": 1.083915, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.08212786438033774, + "language_loss": 0.87634504, + "learning_rate": 0.0008632193058607608, + "loss": 0.88735861, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.17456055, + "step": 1372, + "time_per_iteration": 2.7757019996643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114382, + "balance_loss_mlp": 1.09665525, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.10317877520485044, + "language_loss": 0.80747414, + "learning_rate": 0.0008630051337218466, + "loss": 0.81861794, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.17736816, + "step": 1373, + "time_per_iteration": 2.7459805011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117051, + "balance_loss_mlp": 1.09961104, + "epoch": 0.2643324355521354, + 
"flos": 582251037696.0, + "grad_norm": 0.08099527295858751, + "language_loss": 0.82020557, + "learning_rate": 0.0008627908206518409, + "loss": 0.83137608, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.17456055, + "step": 1374, + "time_per_iteration": 2.719428300857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113813, + "balance_loss_mlp": 1.12554145, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.042063102349752246, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76289386, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.12597656, + "step": 1375, + "time_per_iteration": 4.988332748413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112187, + "balance_loss_mlp": 1.09442437, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.06812086657274741, + "language_loss": 0.91138768, + "learning_rate": 0.0008623617720514241, + "loss": 0.92250949, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.1776123, + "step": 1376, + "time_per_iteration": 2.644531726837158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109794, + "balance_loss_mlp": 1.09182918, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.0722091181333716, + "language_loss": 0.84490621, + "learning_rate": 0.0008621470366875848, + "loss": 0.85600418, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.1796875, + "step": 1377, + "time_per_iteration": 2.605417490005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100816, + "balance_loss_mlp": 1.08375728, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.07263229866332392, + "language_loss": 0.87396085, + "learning_rate": 0.0008619321607257966, + "loss": 0.884969, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.17077637, + "step": 1378, + "time_per_iteration": 2.7229108810424805 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100855, + "balance_loss_mlp": 1.08392727, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 0.07341413806820511, + "language_loss": 0.82002622, + "learning_rate": 0.000861717144249482, + "loss": 0.83103478, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.16943359, + "step": 1379, + "time_per_iteration": 2.9031612873077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105487, + "balance_loss_mlp": 1.08884549, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06987190342408907, + "language_loss": 0.89693463, + "learning_rate": 0.0008615019873421175, + "loss": 0.9079895, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.16650391, + "step": 1380, + "time_per_iteration": 2.5554280281066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105329, + "balance_loss_mlp": 1.08804345, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.07960659576711203, + "language_loss": 0.85129094, + "learning_rate": 0.0008612866900872349, + "loss": 0.86234426, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.17297363, + "step": 1381, + "time_per_iteration": 2.560756206512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115387, + "balance_loss_mlp": 1.0986619, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.10185032090542295, + "language_loss": 0.87836969, + "learning_rate": 0.0008610712525684197, + "loss": 0.88952351, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.1673584, + "step": 1382, + "time_per_iteration": 2.649127721786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111173, + "balance_loss_mlp": 1.09392381, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.09094270381931494, + "language_loss": 0.84048492, + "learning_rate": 0.0008608556748693121, + "loss": 
0.85159665, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.17260742, + "step": 1383, + "time_per_iteration": 3.2573940753936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109064, + "balance_loss_mlp": 1.09163558, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.0818167871774861, + "language_loss": 0.859007, + "learning_rate": 0.000860639957073607, + "loss": 0.87009764, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.17443848, + "step": 1384, + "time_per_iteration": 2.7120518684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110901, + "balance_loss_mlp": 1.0937109, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.07681443511092155, + "language_loss": 0.87386912, + "learning_rate": 0.0008604240992650534, + "loss": 0.88497818, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.17211914, + "step": 1385, + "time_per_iteration": 2.69921612739563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113027, + "balance_loss_mlp": 1.09546757, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.06494344058238215, + "language_loss": 0.88934892, + "learning_rate": 0.0008602081015274545, + "loss": 0.9004792, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.17553711, + "step": 1386, + "time_per_iteration": 2.7353157997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117717, + "balance_loss_mlp": 1.10092068, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.06900257884101904, + "language_loss": 0.83328801, + "learning_rate": 0.0008599919639446684, + "loss": 0.8444652, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.16809082, + "step": 1387, + "time_per_iteration": 2.6927597522735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110335, + "balance_loss_mlp": 1.09289455, + "epoch": 0.2670257791458253, + 
"flos": 398982703104.0, + "grad_norm": 0.08338734757979376, + "language_loss": 0.79947424, + "learning_rate": 0.000859775686600607, + "loss": 0.81057751, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.17468262, + "step": 1388, + "time_per_iteration": 2.5740597248077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123987, + "balance_loss_mlp": 1.10719037, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.09984082638450108, + "language_loss": 0.84917498, + "learning_rate": 0.0008595592695792367, + "loss": 0.86041486, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.16809082, + "step": 1389, + "time_per_iteration": 2.6907854080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112253, + "balance_loss_mlp": 1.10618591, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.06989103866242331, + "language_loss": 0.90147883, + "learning_rate": 0.0008593427129645778, + "loss": 0.91270411, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.16345215, + "step": 1390, + "time_per_iteration": 2.6145434379577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120602, + "balance_loss_mlp": 1.10381722, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.07905482313842922, + "language_loss": 0.85086334, + "learning_rate": 0.0008591260168407052, + "loss": 0.86206937, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.16796875, + "step": 1391, + "time_per_iteration": 2.787076711654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117002, + "balance_loss_mlp": 1.10062313, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.0789554563697551, + "language_loss": 0.8226018, + "learning_rate": 0.0008589091812917479, + "loss": 0.83377182, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.16381836, + "step": 1392, + "time_per_iteration": 2.6753129959106445 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122652, + "balance_loss_mlp": 1.10604584, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 0.07614476371572584, + "language_loss": 0.84920317, + "learning_rate": 0.0008586922064018887, + "loss": 0.86042964, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.1661377, + "step": 1393, + "time_per_iteration": 2.716813325881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114509, + "balance_loss_mlp": 1.09750938, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.08000570031295028, + "language_loss": 0.89098954, + "learning_rate": 0.0008584750922553651, + "loss": 0.90213466, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.17016602, + "step": 1394, + "time_per_iteration": 3.1575980186462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121389, + "balance_loss_mlp": 1.10477114, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.0683134764251081, + "language_loss": 0.83357704, + "learning_rate": 0.0008582578389364677, + "loss": 0.84479094, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.16625977, + "step": 1395, + "time_per_iteration": 2.885806083679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127129, + "balance_loss_mlp": 1.10989153, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.08737379963197432, + "language_loss": 0.91578317, + "learning_rate": 0.0008580404465295422, + "loss": 0.92705452, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.17260742, + "step": 1396, + "time_per_iteration": 2.849519968032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135341, + "balance_loss_mlp": 1.1180197, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.08461023567525901, + "language_loss": 0.8857668, + "learning_rate": 0.0008578229151189876, + "loss": 
0.89712024, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.17321777, + "step": 1397, + "time_per_iteration": 2.94858980178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127453, + "balance_loss_mlp": 1.10984576, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.12493178829468786, + "language_loss": 0.81211323, + "learning_rate": 0.0008576052447892573, + "loss": 0.82338774, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.17614746, + "step": 1398, + "time_per_iteration": 2.534120798110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135254, + "balance_loss_mlp": 1.1178261, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.06803844431236612, + "language_loss": 0.85910499, + "learning_rate": 0.000857387435624858, + "loss": 0.87045753, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.17456055, + "step": 1399, + "time_per_iteration": 2.554008960723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159537, + "balance_loss_mlp": 1.1418941, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.0815296826798993, + "language_loss": 0.87922233, + "learning_rate": 0.0008571694877103513, + "loss": 0.8908177, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.17663574, + "step": 1400, + "time_per_iteration": 3.2941367626190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173169, + "balance_loss_mlp": 1.15442979, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.09384983289618287, + "language_loss": 0.8761692, + "learning_rate": 0.0008569514011303515, + "loss": 0.88790089, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.1875, + "step": 1401, + "time_per_iteration": 2.814588785171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157764, + "balance_loss_mlp": 1.1397872, + "epoch": 0.2697191227395152, + "flos": 
556823683584.0, + "grad_norm": 0.09439986590001768, + "language_loss": 0.87801731, + "learning_rate": 0.0008567331759695277, + "loss": 0.88959491, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.17980957, + "step": 1402, + "time_per_iteration": 2.765251398086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144715, + "balance_loss_mlp": 1.12577283, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.08321050634823257, + "language_loss": 0.85899508, + "learning_rate": 0.0008565148123126023, + "loss": 0.87044227, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.18933105, + "step": 1403, + "time_per_iteration": 2.7100989818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125241, + "balance_loss_mlp": 1.10733557, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.0728098596241797, + "language_loss": 0.86166966, + "learning_rate": 0.0008562963102443516, + "loss": 0.87292206, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.17907715, + "step": 1404, + "time_per_iteration": 2.7286291122436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112919, + "balance_loss_mlp": 1.09493017, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.10158619193030523, + "language_loss": 0.84717911, + "learning_rate": 0.0008560776698496056, + "loss": 0.85830832, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.17993164, + "step": 1405, + "time_per_iteration": 2.9067912101745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103079, + "balance_loss_mlp": 1.08472061, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.08020634125989436, + "language_loss": 0.85596079, + "learning_rate": 0.0008558588912132481, + "loss": 0.86699152, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.18359375, + "step": 1406, + "time_per_iteration": 2.880148410797119 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071193, + "balance_loss_mlp": 1.05955815, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 0.03626473669965315, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77530181, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.11621094, + "step": 1407, + "time_per_iteration": 4.905766487121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087497, + "balance_loss_mlp": 1.06903148, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.0815781254437323, + "language_loss": 0.82643741, + "learning_rate": 0.0008554209195555016, + "loss": 0.83731234, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.18481445, + "step": 1408, + "time_per_iteration": 2.759427309036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086012, + "balance_loss_mlp": 1.06754613, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.08207637293966, + "language_loss": 0.87980115, + "learning_rate": 0.0008552017267041483, + "loss": 0.89066136, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.18457031, + "step": 1409, + "time_per_iteration": 2.71040678024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088281, + "balance_loss_mlp": 1.06865954, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.0734300404961751, + "language_loss": 0.83141303, + "learning_rate": 0.0008549823959512549, + "loss": 0.84229583, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.19616699, + "step": 1410, + "time_per_iteration": 2.6883578300476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104836, + "balance_loss_mlp": 1.08663297, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.07342840956593329, + "language_loss": 0.86307788, + "learning_rate": 0.0008547629273819728, + "loss": 0.87412632, 
+ "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.18212891, + "step": 1411, + "time_per_iteration": 3.4179537296295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110447, + "balance_loss_mlp": 1.09208882, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.07902892919535931, + "language_loss": 0.83264589, + "learning_rate": 0.0008545433210815074, + "loss": 0.84375036, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.18347168, + "step": 1412, + "time_per_iteration": 2.644336462020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132524, + "balance_loss_mlp": 1.11396301, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.08239543530107682, + "language_loss": 0.87688351, + "learning_rate": 0.0008543235771351176, + "loss": 0.88820869, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.18554688, + "step": 1413, + "time_per_iteration": 2.7242777347564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140498, + "balance_loss_mlp": 1.12286687, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.06292390757949942, + "language_loss": 0.84580851, + "learning_rate": 0.0008541036956281154, + "loss": 0.85721344, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.17651367, + "step": 1414, + "time_per_iteration": 2.917314052581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149081, + "balance_loss_mlp": 1.13212919, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.09608953935856007, + "language_loss": 0.81591362, + "learning_rate": 0.0008538836766458665, + "loss": 0.82740438, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.16967773, + "step": 1415, + "time_per_iteration": 2.8857710361480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115948, + "balance_loss_mlp": 1.14234948, + "epoch": 0.2724124663332051, + "flos": 
579631873536.0, + "grad_norm": 0.09141970967130493, + "language_loss": 0.84791577, + "learning_rate": 0.0008536635202737897, + "loss": 0.85951054, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.17150879, + "step": 1416, + "time_per_iteration": 2.8404181003570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168227, + "balance_loss_mlp": 1.15094137, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.08934509912200893, + "language_loss": 0.81624401, + "learning_rate": 0.0008534432265973573, + "loss": 0.82792622, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.1730957, + "step": 1417, + "time_per_iteration": 2.636125326156616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117943, + "balance_loss_mlp": 1.16220391, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.09636198633360953, + "language_loss": 0.87909538, + "learning_rate": 0.000853222795702095, + "loss": 0.89088964, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.17248535, + "step": 1418, + "time_per_iteration": 3.452954053878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168866, + "balance_loss_mlp": 1.15174711, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.09586408952292569, + "language_loss": 0.83810413, + "learning_rate": 0.0008530022276735813, + "loss": 0.84979284, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.17138672, + "step": 1419, + "time_per_iteration": 2.74656081199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160265, + "balance_loss_mlp": 1.14302731, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.07361815357739941, + "language_loss": 0.8564744, + "learning_rate": 0.0008527815225974489, + "loss": 0.86807704, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.17260742, + "step": 1420, + "time_per_iteration": 2.6620352268218994 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161372, + "balance_loss_mlp": 1.14375329, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 0.10060729288286506, + "language_loss": 0.88312179, + "learning_rate": 0.0008525606805593829, + "loss": 0.89473552, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.17651367, + "step": 1421, + "time_per_iteration": 2.4528608322143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152179, + "balance_loss_mlp": 1.13429809, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.0906337737142573, + "language_loss": 0.82765526, + "learning_rate": 0.0008523397016451213, + "loss": 0.83917701, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.17895508, + "step": 1422, + "time_per_iteration": 2.611370086669922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146443, + "balance_loss_mlp": 1.12862146, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.0675988615568281, + "language_loss": 0.86714458, + "learning_rate": 0.0008521185859404564, + "loss": 0.87860906, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.17822266, + "step": 1423, + "time_per_iteration": 3.4147353172302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127875, + "balance_loss_mlp": 1.11027932, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.10391013903512737, + "language_loss": 0.89233863, + "learning_rate": 0.0008518973335312326, + "loss": 0.90361738, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.17602539, + "step": 1424, + "time_per_iteration": 2.8380019664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131922, + "balance_loss_mlp": 1.11418414, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.08776572848910039, + "language_loss": 0.83471692, + "learning_rate": 0.0008516759445033477, + "loss": 
0.8460362, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.1776123, + "step": 1425, + "time_per_iteration": 2.6492245197296143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148521, + "balance_loss_mlp": 1.13083041, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.09331893476455168, + "language_loss": 0.84960282, + "learning_rate": 0.0008514544189427526, + "loss": 0.86108804, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.17687988, + "step": 1426, + "time_per_iteration": 2.694824457168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160501, + "balance_loss_mlp": 1.14289403, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.10058930784889258, + "language_loss": 0.86324757, + "learning_rate": 0.0008512327569354511, + "loss": 0.8748526, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.17602539, + "step": 1427, + "time_per_iteration": 2.5711381435394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170402, + "balance_loss_mlp": 1.15265131, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.08313733600620697, + "language_loss": 0.83505958, + "learning_rate": 0.0008510109585675001, + "loss": 0.84676361, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.17749023, + "step": 1428, + "time_per_iteration": 2.6291069984436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075627, + "balance_loss_mlp": 1.06465936, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.04529042076604016, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82228971, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.10986328, + "step": 1429, + "time_per_iteration": 4.732970952987671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151608, + "balance_loss_mlp": 1.13460922, + "epoch": 0.275105809926895, + 
"flos": 970861718016.0, + "grad_norm": 0.10649873504882197, + "language_loss": 0.80186272, + "learning_rate": 0.0008505669530941415, + "loss": 0.81337881, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.17016602, + "step": 1430, + "time_per_iteration": 3.3425114154815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132102, + "balance_loss_mlp": 1.11454248, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.09668389067503143, + "language_loss": 0.83789647, + "learning_rate": 0.000850344746161112, + "loss": 0.84921753, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.17578125, + "step": 1431, + "time_per_iteration": 2.6212620735168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115216, + "balance_loss_mlp": 1.09790659, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.07650346740070771, + "language_loss": 0.87718683, + "learning_rate": 0.0008501224032121894, + "loss": 0.88833898, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.17321777, + "step": 1432, + "time_per_iteration": 2.531632900238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099408, + "balance_loss_mlp": 1.0818007, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.07599019403635421, + "language_loss": 0.81644619, + "learning_rate": 0.0008498999243336946, + "loss": 0.82744026, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.17626953, + "step": 1433, + "time_per_iteration": 2.6577858924865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108116, + "balance_loss_mlp": 1.09086609, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.08691171830183525, + "language_loss": 0.87290454, + "learning_rate": 0.0008496773096120021, + "loss": 0.8839857, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.17260742, + "step": 1434, + "time_per_iteration": 2.8218367099761963 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103536, + "balance_loss_mlp": 1.08573806, + "epoch": 0.27606771835321275, + "flos": 740129094144.0, + "grad_norm": 0.09853984157164923, + "language_loss": 0.83996856, + "learning_rate": 0.0008494545591335381, + "loss": 0.85100389, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.17810059, + "step": 1435, + "time_per_iteration": 2.9297800064086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114552, + "balance_loss_mlp": 1.09671807, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.06137328591569865, + "language_loss": 0.86751276, + "learning_rate": 0.0008492316729847823, + "loss": 0.87865829, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.1784668, + "step": 1436, + "time_per_iteration": 2.8056235313415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111542, + "balance_loss_mlp": 1.09787273, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08045565015071575, + "language_loss": 0.79808342, + "learning_rate": 0.0008490086512522664, + "loss": 0.8092376, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.17565918, + "step": 1437, + "time_per_iteration": 2.7486345767974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125236, + "balance_loss_mlp": 1.10653245, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.07152243392964944, + "language_loss": 0.90246308, + "learning_rate": 0.0008487854940225755, + "loss": 0.91371536, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.18701172, + "step": 1438, + "time_per_iteration": 2.45500111579895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119071, + "balance_loss_mlp": 1.10104609, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.12336147646099646, + "language_loss": 0.89520633, + "learning_rate": 0.0008485622013823466, + "loss": 
0.9063971, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.18029785, + "step": 1439, + "time_per_iteration": 2.6394927501678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116899, + "balance_loss_mlp": 1.09899366, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.08970889576331396, + "language_loss": 0.83229852, + "learning_rate": 0.00084833877341827, + "loss": 0.84346747, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.17895508, + "step": 1440, + "time_per_iteration": 2.673386812210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137485, + "balance_loss_mlp": 1.11953235, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.09818503582677594, + "language_loss": 0.8055383, + "learning_rate": 0.000848115210217088, + "loss": 0.81691313, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.17956543, + "step": 1441, + "time_per_iteration": 2.6129040718078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143053, + "balance_loss_mlp": 1.12554169, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.08082573862086316, + "language_loss": 0.81372535, + "learning_rate": 0.0008478915118655952, + "loss": 0.82515597, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.17529297, + "step": 1442, + "time_per_iteration": 2.843041181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150917, + "balance_loss_mlp": 1.13371468, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.07560665817061937, + "language_loss": 0.86043841, + "learning_rate": 0.0008476676784506393, + "loss": 0.87194753, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.17224121, + "step": 1443, + "time_per_iteration": 2.669281005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145766, + "balance_loss_mlp": 1.12862349, + "epoch": 0.2777991535205848, + 
"flos": 1004395811328.0, + "grad_norm": 0.07357545068984293, + "language_loss": 0.81809199, + "learning_rate": 0.0008474437100591201, + "loss": 0.82954967, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.17150879, + "step": 1444, + "time_per_iteration": 3.32959246635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112957, + "balance_loss_mlp": 1.1127255, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.08256066258120752, + "language_loss": 0.85183853, + "learning_rate": 0.0008472196067779898, + "loss": 0.86313421, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.1685791, + "step": 1445, + "time_per_iteration": 2.6932947635650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011128, + "balance_loss_mlp": 1.09586096, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.1350534130118882, + "language_loss": 0.85003686, + "learning_rate": 0.0008469953686942531, + "loss": 0.86116487, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.16955566, + "step": 1446, + "time_per_iteration": 3.0903265476226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122325, + "balance_loss_mlp": 1.10539699, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.09027465145753444, + "language_loss": 0.82766867, + "learning_rate": 0.0008467709958949668, + "loss": 0.83889192, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.16943359, + "step": 1447, + "time_per_iteration": 2.7486042976379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122592, + "balance_loss_mlp": 1.1059382, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.08057262764159107, + "language_loss": 0.85942835, + "learning_rate": 0.0008465464884672403, + "loss": 0.87065423, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.16662598, + "step": 1448, + "time_per_iteration": 2.7239129543304443 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128319, + "balance_loss_mlp": 1.11145079, + "epoch": 0.27876106194690264, + "flos": 587333348352.0, + "grad_norm": 0.0722544104008292, + "language_loss": 0.85391676, + "learning_rate": 0.0008463218464982348, + "loss": 0.86520004, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.16882324, + "step": 1449, + "time_per_iteration": 2.824716329574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112804, + "balance_loss_mlp": 1.11102891, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.07814645269371487, + "language_loss": 0.8771199, + "learning_rate": 0.0008460970700751645, + "loss": 0.88840032, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.17016602, + "step": 1450, + "time_per_iteration": 3.1141586303710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126615, + "balance_loss_mlp": 1.10931802, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.07255444133213705, + "language_loss": 0.87776339, + "learning_rate": 0.000845872159285295, + "loss": 0.8890295, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.1730957, + "step": 1451, + "time_per_iteration": 2.739476442337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085209, + "balance_loss_mlp": 1.07529104, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.033234239085754465, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78852057, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.09912109, + "step": 1452, + "time_per_iteration": 4.910952806472778 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138861, + "balance_loss_mlp": 1.12121844, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.10385775803237589, + "language_loss": 0.86136031, + "learning_rate": 0.0008454219349544836, + "loss": 
0.87274891, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.17651367, + "step": 1453, + "time_per_iteration": 3.3671629428863525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121876, + "balance_loss_mlp": 1.10430491, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.07125574209855656, + "language_loss": 0.82064086, + "learning_rate": 0.000845196621588334, + "loss": 0.83185959, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.17602539, + "step": 1454, + "time_per_iteration": 2.775218963623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125012, + "balance_loss_mlp": 1.107584, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.07195305251583452, + "language_loss": 0.7580061, + "learning_rate": 0.0008449711742049706, + "loss": 0.76925623, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.17443848, + "step": 1455, + "time_per_iteration": 2.785322427749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120523, + "balance_loss_mlp": 1.10295129, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.08382647519260926, + "language_loss": 0.83480191, + "learning_rate": 0.0008447455928919196, + "loss": 0.84600711, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.17590332, + "step": 1456, + "time_per_iteration": 2.660736083984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119281, + "balance_loss_mlp": 1.10179305, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.0678890613230097, + "language_loss": 0.86596936, + "learning_rate": 0.0008445198777367595, + "loss": 0.87716216, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.17492676, + "step": 1457, + "time_per_iteration": 2.5753204822540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121467, + "balance_loss_mlp": 1.10389531, + "epoch": 0.2804924971142747, + 
"flos": 522074170368.0, + "grad_norm": 0.10986912551565038, + "language_loss": 0.80972993, + "learning_rate": 0.0008442940288271208, + "loss": 0.82094461, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.17578125, + "step": 1458, + "time_per_iteration": 2.641165018081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112066, + "balance_loss_mlp": 1.10273051, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06853525506838967, + "language_loss": 0.86948931, + "learning_rate": 0.0008440680462506856, + "loss": 0.88069594, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.17932129, + "step": 1459, + "time_per_iteration": 2.7613425254821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115398, + "balance_loss_mlp": 1.09818411, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.07519938139917645, + "language_loss": 0.86463004, + "learning_rate": 0.0008438419300951883, + "loss": 0.87578404, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.17224121, + "step": 1460, + "time_per_iteration": 2.684657335281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116837, + "balance_loss_mlp": 1.09928942, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.0687143759737579, + "language_loss": 0.86178434, + "learning_rate": 0.0008436156804484148, + "loss": 0.8729527, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.17565918, + "step": 1461, + "time_per_iteration": 2.860818386077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111314, + "balance_loss_mlp": 1.09343266, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.11710518654826144, + "language_loss": 0.88180649, + "learning_rate": 0.0008433892973982031, + "loss": 0.89291972, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.17883301, + "step": 1462, + "time_per_iteration": 2.58311128616333 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106143, + "balance_loss_mlp": 1.08844042, + "epoch": 0.28145440554059253, + "flos": 530704742400.0, + "grad_norm": 0.07819154550189573, + "language_loss": 0.84951186, + "learning_rate": 0.0008431627810324431, + "loss": 0.86057329, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.17724609, + "step": 1463, + "time_per_iteration": 2.6800074577331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111406, + "balance_loss_mlp": 1.09443069, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.06467590099086191, + "language_loss": 0.81057346, + "learning_rate": 0.000842936131439076, + "loss": 0.82168752, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.16992188, + "step": 1464, + "time_per_iteration": 2.6747214794158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111707, + "balance_loss_mlp": 1.09463668, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.06943840277913271, + "language_loss": 0.87714398, + "learning_rate": 0.0008427093487060951, + "loss": 0.88826108, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.17089844, + "step": 1465, + "time_per_iteration": 2.6723203659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113512, + "balance_loss_mlp": 1.09656, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.06709163317621891, + "language_loss": 0.846192, + "learning_rate": 0.000842482432921545, + "loss": 0.8573271, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.16955566, + "step": 1466, + "time_per_iteration": 2.8659911155700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104971, + "balance_loss_mlp": 1.0876503, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.07868097185173097, + "language_loss": 0.86230814, + "learning_rate": 0.0008422553841735225, + "loss": 
0.87335783, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.17333984, + "step": 1467, + "time_per_iteration": 2.5069150924682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109046, + "balance_loss_mlp": 1.09167767, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.07514750891429747, + "language_loss": 0.84737515, + "learning_rate": 0.0008420282025501757, + "loss": 0.85846567, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.17370605, + "step": 1468, + "time_per_iteration": 2.808751344680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094588, + "balance_loss_mlp": 1.07768393, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.0683968152950732, + "language_loss": 0.84884882, + "learning_rate": 0.0008418008881397043, + "loss": 0.85979474, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.16918945, + "step": 1469, + "time_per_iteration": 2.6929962635040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089051, + "balance_loss_mlp": 1.07267165, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.0720569823253329, + "language_loss": 0.82694614, + "learning_rate": 0.0008415734410303595, + "loss": 0.83783662, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.16381836, + "step": 1470, + "time_per_iteration": 3.2501566410064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095144, + "balance_loss_mlp": 1.07776332, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.07334017240809462, + "language_loss": 0.90763617, + "learning_rate": 0.0008413458613104444, + "loss": 0.91858757, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.17407227, + "step": 1471, + "time_per_iteration": 2.7336316108703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089358, + "balance_loss_mlp": 1.07198906, + "epoch": 0.2831858407079646, 
+ "flos": 571606626816.0, + "grad_norm": 0.06835636483746928, + "language_loss": 0.82895148, + "learning_rate": 0.0008411181490683129, + "loss": 0.839845, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.17370605, + "step": 1472, + "time_per_iteration": 2.742314100265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085557, + "balance_loss_mlp": 1.0680809, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.08020623974692119, + "language_loss": 0.82316583, + "learning_rate": 0.0008408903043923707, + "loss": 0.83402139, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.17492676, + "step": 1473, + "time_per_iteration": 3.0307655334472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090161, + "balance_loss_mlp": 1.07230377, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09874308222598177, + "language_loss": 0.81175971, + "learning_rate": 0.0008406623273710754, + "loss": 0.8226614, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.17858887, + "step": 1474, + "time_per_iteration": 2.6652164459228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086824, + "balance_loss_mlp": 1.06919324, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.0806852987114514, + "language_loss": 0.82865691, + "learning_rate": 0.0008404342180929351, + "loss": 0.83952522, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.1763916, + "step": 1475, + "time_per_iteration": 2.676020622253418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.06739831, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.0807623151432505, + "language_loss": 0.81497931, + "learning_rate": 0.00084020597664651, + "loss": 0.82583237, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.17907715, + "step": 1476, + "time_per_iteration": 2.8055877685546875 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087711, + "balance_loss_mlp": 1.06957936, + "epoch": 0.2841477491342824, + "flos": 573635146752.0, + "grad_norm": 0.09698913749719028, + "language_loss": 0.83786356, + "learning_rate": 0.0008399776031204111, + "loss": 0.8487407, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.18139648, + "step": 1477, + "time_per_iteration": 2.7545149326324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087702, + "balance_loss_mlp": 1.06898642, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.09010893322506078, + "language_loss": 0.7971096, + "learning_rate": 0.0008397490976033009, + "loss": 0.80798662, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.18713379, + "step": 1478, + "time_per_iteration": 2.654254198074341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107225, + "balance_loss_mlp": 1.06009066, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.04001675887347635, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78951895, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.12158203, + "step": 1479, + "time_per_iteration": 4.77993631362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088996, + "balance_loss_mlp": 1.07022035, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.07008895147668387, + "language_loss": 0.84977293, + "learning_rate": 0.0008392916909509525, + "loss": 0.86066294, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.18762207, + "step": 1480, + "time_per_iteration": 3.103787422180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110182, + "balance_loss_mlp": 1.08308077, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.07686502510285433, + "language_loss": 0.8518846, + "learning_rate": 0.0008390627899932954, + "loss": 
0.86290276, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.18737793, + "step": 1481, + "time_per_iteration": 2.6177799701690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113908, + "balance_loss_mlp": 1.09524012, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.10214098417508043, + "language_loss": 0.88852942, + "learning_rate": 0.000838833757399789, + "loss": 0.89966846, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.18664551, + "step": 1482, + "time_per_iteration": 2.9566540718078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114933, + "balance_loss_mlp": 1.09678972, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.08257095939450843, + "language_loss": 0.80571115, + "learning_rate": 0.0008386045932593515, + "loss": 0.81686044, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.18139648, + "step": 1483, + "time_per_iteration": 2.717756509780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109957, + "balance_loss_mlp": 1.09277904, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.07262082200825942, + "language_loss": 0.86045611, + "learning_rate": 0.0008383752976609525, + "loss": 0.87155575, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.171875, + "step": 1484, + "time_per_iteration": 2.950330972671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113011, + "balance_loss_mlp": 1.09571338, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06349274760065945, + "language_loss": 0.7998122, + "learning_rate": 0.0008381458706936123, + "loss": 0.81094229, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.17321777, + "step": 1485, + "time_per_iteration": 2.750422239303589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105744, + "balance_loss_mlp": 1.08867359, + "epoch": 0.2858791843016545, + 
"flos": 583772977152.0, + "grad_norm": 0.08725606785490185, + "language_loss": 0.87347835, + "learning_rate": 0.0008379163124464025, + "loss": 0.88453579, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.17089844, + "step": 1486, + "time_per_iteration": 2.8127403259277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108247, + "balance_loss_mlp": 1.09145021, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.08194161324991753, + "language_loss": 0.7704097, + "learning_rate": 0.0008376866230084452, + "loss": 0.78149223, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.16809082, + "step": 1487, + "time_per_iteration": 2.8382246494293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102304, + "balance_loss_mlp": 1.08535266, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.07305349361660647, + "language_loss": 0.85623455, + "learning_rate": 0.000837456802468914, + "loss": 0.8672576, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.16967773, + "step": 1488, + "time_per_iteration": 2.619359016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101386, + "balance_loss_mlp": 1.08414829, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.08101706440693511, + "language_loss": 0.85233498, + "learning_rate": 0.0008372268509170331, + "loss": 0.86334878, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.17248535, + "step": 1489, + "time_per_iteration": 2.735579252243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104312, + "balance_loss_mlp": 1.08728886, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.09066736504037358, + "language_loss": 0.84989464, + "learning_rate": 0.0008369967684420779, + "loss": 0.86093777, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.17041016, + "step": 1490, + "time_per_iteration": 2.7550840377807617 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099327, + "balance_loss_mlp": 1.08251846, + "epoch": 0.2868410927279723, + "flos": 482224720896.0, + "grad_norm": 0.11208283725325253, + "language_loss": 0.84236765, + "learning_rate": 0.0008367665551333736, + "loss": 0.85336089, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.16821289, + "step": 1491, + "time_per_iteration": 2.6229591369628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118339, + "balance_loss_mlp": 1.10114861, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.08256436767566132, + "language_loss": 0.85062146, + "learning_rate": 0.0008365362110802977, + "loss": 0.86180484, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.17211914, + "step": 1492, + "time_per_iteration": 2.871260166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139482, + "balance_loss_mlp": 1.12254202, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.14712707580735673, + "language_loss": 0.82232606, + "learning_rate": 0.0008363057363722773, + "loss": 0.83372086, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.16955566, + "step": 1493, + "time_per_iteration": 2.8748109340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156529, + "balance_loss_mlp": 1.14010167, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.10196458183452421, + "language_loss": 0.84016562, + "learning_rate": 0.0008360751310987906, + "loss": 0.85173088, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.16430664, + "step": 1494, + "time_per_iteration": 2.6634154319763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156202, + "balance_loss_mlp": 1.13989449, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.07806891614800103, + "language_loss": 0.85166085, + "learning_rate": 0.0008358443953493666, + 
"loss": 0.8632229, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.16308594, + "step": 1495, + "time_per_iteration": 2.875852584838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161766, + "balance_loss_mlp": 1.1449573, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.11619662908019952, + "language_loss": 0.88208884, + "learning_rate": 0.0008356135292135851, + "loss": 0.89370644, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.16821289, + "step": 1496, + "time_per_iteration": 2.5129776000976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129263, + "balance_loss_mlp": 1.11256182, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.0960393188024377, + "language_loss": 0.91794455, + "learning_rate": 0.0008353825327810758, + "loss": 0.92923725, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.16711426, + "step": 1497, + "time_per_iteration": 2.437619686126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109969, + "balance_loss_mlp": 1.09312487, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.09345990074491838, + "language_loss": 0.81679749, + "learning_rate": 0.00083515140614152, + "loss": 0.82789719, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.1685791, + "step": 1498, + "time_per_iteration": 2.7478325366973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119738, + "balance_loss_mlp": 1.10310864, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.10003726096036522, + "language_loss": 0.868577, + "learning_rate": 0.0008349201493846485, + "loss": 0.87977445, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.16625977, + "step": 1499, + "time_per_iteration": 2.639324188232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116843, + "balance_loss_mlp": 1.09971237, + "epoch": 0.2885725278953444, 
+ "flos": 480094884864.0, + "grad_norm": 0.07951211502216154, + "language_loss": 0.89032578, + "learning_rate": 0.0008346887626002432, + "loss": 0.90149426, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.17150879, + "step": 1500, + "time_per_iteration": 2.542311668395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120306, + "balance_loss_mlp": 1.10360527, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.0665017309713035, + "language_loss": 0.85912937, + "learning_rate": 0.000834457245878137, + "loss": 0.87033248, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.16711426, + "step": 1501, + "time_per_iteration": 2.639570951461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122516, + "balance_loss_mlp": 1.10619664, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.07589763823888349, + "language_loss": 0.80857193, + "learning_rate": 0.000834225599308212, + "loss": 0.81979704, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.16320801, + "step": 1502, + "time_per_iteration": 3.2867560386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113369, + "balance_loss_mlp": 1.11684537, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.09000351929941647, + "language_loss": 0.84986663, + "learning_rate": 0.0008339938229804016, + "loss": 0.86120355, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.1685791, + "step": 1503, + "time_per_iteration": 2.7262394428253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167456, + "balance_loss_mlp": 1.15496254, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.04837114619258858, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.7660228, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.12451172, + "step": 1504, + "time_per_iteration": 4.9622483253479 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129895, + "balance_loss_mlp": 1.11289549, + "epoch": 0.2895344363216622, + "flos": 470186850816.0, + "grad_norm": 0.1124140207378676, + "language_loss": 0.83872616, + "learning_rate": 0.0008335298814111094, + "loss": 0.85002512, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.17016602, + "step": 1505, + "time_per_iteration": 2.6357829570770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133506, + "balance_loss_mlp": 1.11616087, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.09211411957598506, + "language_loss": 0.87906271, + "learning_rate": 0.0008332977163497455, + "loss": 0.89039779, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.17370605, + "step": 1506, + "time_per_iteration": 2.798208475112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123584, + "balance_loss_mlp": 1.10653734, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.07286788522172229, + "language_loss": 0.83603442, + "learning_rate": 0.0008330654218907325, + "loss": 0.84727025, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.1706543, + "step": 1507, + "time_per_iteration": 2.708980083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112441, + "balance_loss_mlp": 1.09509647, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.06462764814837715, + "language_loss": 0.8140111, + "learning_rate": 0.0008328329981242548, + "loss": 0.82513553, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.17358398, + "step": 1508, + "time_per_iteration": 2.894169330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110509, + "balance_loss_mlp": 1.08767331, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.08188322832397743, + "language_loss": 0.87448251, + "learning_rate": 0.0008326004451405475, + "loss": 
0.88553333, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.17443848, + "step": 1509, + "time_per_iteration": 2.8026657104492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092866, + "balance_loss_mlp": 1.07596231, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.07862145855051805, + "language_loss": 0.81981707, + "learning_rate": 0.0008323677630298957, + "loss": 0.8307457, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.16918945, + "step": 1510, + "time_per_iteration": 2.6314613819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109107, + "balance_loss_mlp": 1.07407045, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.06795291351042136, + "language_loss": 0.84809089, + "learning_rate": 0.0008321349518826345, + "loss": 0.85900158, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.17016602, + "step": 1511, + "time_per_iteration": 2.8404459953308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086604, + "balance_loss_mlp": 1.06950927, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.11455853074779208, + "language_loss": 0.95139891, + "learning_rate": 0.0008319020117891491, + "loss": 0.96226501, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.17102051, + "step": 1512, + "time_per_iteration": 2.6767001152038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084061, + "balance_loss_mlp": 1.06650186, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.0847466939070868, + "language_loss": 0.86754417, + "learning_rate": 0.0008316689428398751, + "loss": 0.87838477, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.17565918, + "step": 1513, + "time_per_iteration": 2.7069385051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079727, + "balance_loss_mlp": 1.06318033, + "epoch": 0.29126587148903427, + 
"flos": 574672900608.0, + "grad_norm": 0.1225209310639027, + "language_loss": 0.88519126, + "learning_rate": 0.0008314357451252979, + "loss": 0.89598852, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.16552734, + "step": 1514, + "time_per_iteration": 2.8014771938323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088545, + "balance_loss_mlp": 1.07215357, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.09390151153588368, + "language_loss": 0.87912899, + "learning_rate": 0.0008312024187359527, + "loss": 0.89001441, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.16394043, + "step": 1515, + "time_per_iteration": 2.646131992340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089961, + "balance_loss_mlp": 1.07367659, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.0632997915526053, + "language_loss": 0.87038326, + "learning_rate": 0.000830968963762425, + "loss": 0.88128293, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.1628418, + "step": 1516, + "time_per_iteration": 3.0603909492492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104224, + "balance_loss_mlp": 1.08745098, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.08225160647217689, + "language_loss": 0.83996677, + "learning_rate": 0.0008307353802953497, + "loss": 0.85100901, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.16784668, + "step": 1517, + "time_per_iteration": 2.7085869312286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105905, + "balance_loss_mlp": 1.08885777, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.07719324020211826, + "language_loss": 0.85852122, + "learning_rate": 0.0008305016684254125, + "loss": 0.86958027, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.17053223, + "step": 1518, + "time_per_iteration": 2.843050241470337 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114876, + "balance_loss_mlp": 1.0979718, + "epoch": 0.29222777991535204, + "flos": 501662688768.0, + "grad_norm": 0.07921278172023684, + "language_loss": 0.86861145, + "learning_rate": 0.0008302678282433479, + "loss": 0.87976027, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.16918945, + "step": 1519, + "time_per_iteration": 2.605964422225952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122329, + "balance_loss_mlp": 1.10534143, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07975311040882123, + "language_loss": 0.84663725, + "learning_rate": 0.0008300338598399411, + "loss": 0.85786051, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.17004395, + "step": 1520, + "time_per_iteration": 2.6344962120056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128968, + "balance_loss_mlp": 1.11150408, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.07139673380832469, + "language_loss": 0.9444648, + "learning_rate": 0.0008297997633060263, + "loss": 0.95575452, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.17480469, + "step": 1521, + "time_per_iteration": 2.5109918117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123567, + "balance_loss_mlp": 1.10605538, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07755113838475138, + "language_loss": 0.84917367, + "learning_rate": 0.0008295655387324883, + "loss": 0.86040938, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.17529297, + "step": 1522, + "time_per_iteration": 2.8314778804779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132674, + "balance_loss_mlp": 1.11578202, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.08909358029202981, + "language_loss": 0.84779286, + "learning_rate": 0.0008293311862102609, + "loss": 
0.85911965, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.16894531, + "step": 1523, + "time_per_iteration": 2.5455641746520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112826, + "balance_loss_mlp": 1.11147499, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.07268877656623862, + "language_loss": 0.88628173, + "learning_rate": 0.0008290967058303275, + "loss": 0.89756435, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.16796875, + "step": 1524, + "time_per_iteration": 2.5151915550231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114491, + "balance_loss_mlp": 1.1288048, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07556317822889831, + "language_loss": 0.86503643, + "learning_rate": 0.0008288620976837219, + "loss": 0.87648547, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.16101074, + "step": 1525, + "time_per_iteration": 2.526381731033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145799, + "balance_loss_mlp": 1.12897861, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.07322803654736391, + "language_loss": 0.826621, + "learning_rate": 0.000828627361861527, + "loss": 0.83807898, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.16833496, + "step": 1526, + "time_per_iteration": 2.629249334335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146511, + "balance_loss_mlp": 1.13019073, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.08423530938833095, + "language_loss": 0.84572363, + "learning_rate": 0.0008283924984548752, + "loss": 0.8571887, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.16320801, + "step": 1527, + "time_per_iteration": 2.966165542602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140198, + "balance_loss_mlp": 1.12374687, + "epoch": 0.2939592150827241, + 
"flos": 478590197760.0, + "grad_norm": 0.0645510946599831, + "language_loss": 0.8449617, + "learning_rate": 0.0008281575075549485, + "loss": 0.85636371, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.16455078, + "step": 1528, + "time_per_iteration": 2.58369779586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161514, + "balance_loss_mlp": 1.14954567, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.05917981842870205, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78514206, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.11962891, + "step": 1529, + "time_per_iteration": 4.658821105957031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131087, + "balance_loss_mlp": 1.1146121, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.08930626055051794, + "language_loss": 0.90355158, + "learning_rate": 0.0008276871436402469, + "loss": 0.91486251, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.16479492, + "step": 1530, + "time_per_iteration": 2.8411099910736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136163, + "balance_loss_mlp": 1.12017739, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.15569448105103711, + "language_loss": 0.87387383, + "learning_rate": 0.000827451770808083, + "loss": 0.88523543, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.15979004, + "step": 1531, + "time_per_iteration": 2.716938018798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126528, + "balance_loss_mlp": 1.11020815, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.07571292712277376, + "language_loss": 0.83393914, + "learning_rate": 0.0008272162708478674, + "loss": 0.84520441, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.16320801, + "step": 1532, + "time_per_iteration": 2.589401960372925 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125487, + "balance_loss_mlp": 1.10926247, + "epoch": 0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.0702796828307527, + "language_loss": 0.85952383, + "learning_rate": 0.000826980643851029, + "loss": 0.87077868, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.16223145, + "step": 1533, + "time_per_iteration": 2.730564594268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111085, + "balance_loss_mlp": 1.09442306, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.090864784531222, + "language_loss": 0.84450942, + "learning_rate": 0.0008267448899090464, + "loss": 0.85561788, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.16430664, + "step": 1534, + "time_per_iteration": 2.5810909271240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116842, + "balance_loss_mlp": 1.10008121, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.07312583256714535, + "language_loss": 0.80780327, + "learning_rate": 0.0008265090091134473, + "loss": 0.81897163, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.16760254, + "step": 1535, + "time_per_iteration": 2.852243423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101211, + "balance_loss_mlp": 1.08464038, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.06558641515181687, + "language_loss": 0.80252028, + "learning_rate": 0.0008262730015558088, + "loss": 0.81353235, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.16577148, + "step": 1536, + "time_per_iteration": 2.888068675994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094086, + "balance_loss_mlp": 1.07725406, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.0890497395672015, + "language_loss": 0.81906033, + "learning_rate": 0.0008260368673277574, + "loss": 
0.83000118, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.16845703, + "step": 1537, + "time_per_iteration": 3.1171438694000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089572, + "balance_loss_mlp": 1.07263255, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.08897837479493585, + "language_loss": 0.83872563, + "learning_rate": 0.0008258006065209682, + "loss": 0.84962142, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.16955566, + "step": 1538, + "time_per_iteration": 2.749382972717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083685, + "balance_loss_mlp": 1.06642318, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.09390517967885302, + "language_loss": 0.80569965, + "learning_rate": 0.0008255642192271657, + "loss": 0.81653649, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.17285156, + "step": 1539, + "time_per_iteration": 2.834967851638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.07543612, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.08140985627423285, + "language_loss": 0.8348605, + "learning_rate": 0.0008253277055381241, + "loss": 0.84579086, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.17602539, + "step": 1540, + "time_per_iteration": 2.8553531169891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109997, + "balance_loss_mlp": 1.08266127, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.07492894951417867, + "language_loss": 0.8559624, + "learning_rate": 0.0008250910655456658, + "loss": 0.86696208, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.17321777, + "step": 1541, + "time_per_iteration": 3.141746997833252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121244, + "balance_loss_mlp": 1.10318387, + "epoch": 0.296652558676414, + 
"flos": 495868594176.0, + "grad_norm": 0.0890193674873045, + "language_loss": 0.83764815, + "learning_rate": 0.0008248542993416625, + "loss": 0.84886062, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.18054199, + "step": 1542, + "time_per_iteration": 2.634694814682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134671, + "balance_loss_mlp": 1.11682534, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.08265783697410327, + "language_loss": 0.83617258, + "learning_rate": 0.0008246174070180352, + "loss": 0.84751928, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.17871094, + "step": 1543, + "time_per_iteration": 2.7335524559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139596, + "balance_loss_mlp": 1.12247741, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09383563779300157, + "language_loss": 0.83888161, + "learning_rate": 0.0008243803886667537, + "loss": 0.85027754, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.17138672, + "step": 1544, + "time_per_iteration": 3.1672377586364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139138, + "balance_loss_mlp": 1.12212706, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.09212665263146659, + "language_loss": 0.7881431, + "learning_rate": 0.0008241432443798364, + "loss": 0.79953444, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.17028809, + "step": 1545, + "time_per_iteration": 2.8234944343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128999, + "balance_loss_mlp": 1.11242867, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.056688876570847646, + "language_loss": 0.85312325, + "learning_rate": 0.0008239059742493512, + "loss": 0.86441326, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.16577148, + "step": 1546, + "time_per_iteration": 
2.7027690410614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134853, + "balance_loss_mlp": 1.11818719, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.09085945068897121, + "language_loss": 0.87215161, + "learning_rate": 0.0008236685783674142, + "loss": 0.8835001, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.16674805, + "step": 1547, + "time_per_iteration": 3.0873892307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183829, + "balance_loss_mlp": 1.1713357, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.05428295829147524, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77405024, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.12451172, + "step": 1548, + "time_per_iteration": 4.899101972579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134552, + "balance_loss_mlp": 1.11795831, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.08128040699091903, + "language_loss": 0.81818366, + "learning_rate": 0.0008231934097178955, + "loss": 0.82952917, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.16601562, + "step": 1549, + "time_per_iteration": 2.6477086544036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139569, + "balance_loss_mlp": 1.12291551, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.07828537838902122, + "language_loss": 0.85219073, + "learning_rate": 0.0008229556371347903, + "loss": 0.86358643, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.16650391, + "step": 1550, + "time_per_iteration": 3.0261847972869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150744, + "balance_loss_mlp": 1.13455498, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.08823136620200941, + "language_loss": 0.78994125, + "learning_rate": 
0.0008227177391691874, + "loss": 0.8014487, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.16186523, + "step": 1551, + "time_per_iteration": 3.180002212524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136289, + "balance_loss_mlp": 1.11980236, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07744125357066795, + "language_loss": 0.89299029, + "learning_rate": 0.0008224797159134463, + "loss": 0.90435314, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.16491699, + "step": 1552, + "time_per_iteration": 2.739584445953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129762, + "balance_loss_mlp": 1.11325169, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.07274609898716765, + "language_loss": 0.83059317, + "learning_rate": 0.0008222415674599765, + "loss": 0.84189081, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.16516113, + "step": 1553, + "time_per_iteration": 3.1217970848083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118455, + "balance_loss_mlp": 1.10149145, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.07468995972707258, + "language_loss": 0.82944036, + "learning_rate": 0.0008220032939012349, + "loss": 0.84062493, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.1697998, + "step": 1554, + "time_per_iteration": 2.737661600112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111265, + "balance_loss_mlp": 1.0940038, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.06534643910619843, + "language_loss": 0.87635672, + "learning_rate": 0.0008217648953297277, + "loss": 0.88746935, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.17272949, + "step": 1555, + "time_per_iteration": 2.9030354022979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109118, + "balance_loss_mlp": 1.09171319, 
+ "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.07926146627709543, + "language_loss": 0.78007799, + "learning_rate": 0.0008215263718380095, + "loss": 0.79116917, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.17419434, + "step": 1556, + "time_per_iteration": 2.7085471153259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102748, + "balance_loss_mlp": 1.08450937, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.0948368117579541, + "language_loss": 0.84609628, + "learning_rate": 0.0008212877235186833, + "loss": 0.85712373, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.18237305, + "step": 1557, + "time_per_iteration": 2.7050936222076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136692, + "balance_loss_mlp": 1.12467551, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.04579697638503373, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78874254, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12011719, + "step": 1558, + "time_per_iteration": 4.93830418586731 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098204, + "balance_loss_mlp": 1.08031082, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.08681594057082924, + "language_loss": 0.81027186, + "learning_rate": 0.0008208100527678611, + "loss": 0.8212539, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.17907715, + "step": 1559, + "time_per_iteration": 2.6041250228881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_mlp": 1.08412552, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.11630596930036842, + "language_loss": 0.78128254, + "learning_rate": 0.0008205710305218135, + "loss": 0.79229701, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.17333984, + "step": 1560, + 
"time_per_iteration": 3.0562148094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109931, + "balance_loss_mlp": 1.08225095, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.07630099015555136, + "language_loss": 0.89525402, + "learning_rate": 0.0008203318838190541, + "loss": 0.90624714, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.17077637, + "step": 1561, + "time_per_iteration": 2.7627954483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110369, + "balance_loss_mlp": 1.08669066, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.09266250591977641, + "language_loss": 0.84876859, + "learning_rate": 0.0008200926127524281, + "loss": 0.85980552, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.17016602, + "step": 1562, + "time_per_iteration": 2.699997663497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111077, + "balance_loss_mlp": 1.09415245, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.08848358123460635, + "language_loss": 0.82834399, + "learning_rate": 0.0008198532174148289, + "loss": 0.83945167, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.16625977, + "step": 1563, + "time_per_iteration": 2.728264570236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088743, + "balance_loss_mlp": 1.07691729, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.03477061119396021, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81774914, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.11816406, + "step": 1564, + "time_per_iteration": 4.858918905258179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148782, + "balance_loss_mlp": 1.13198543, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.1259196892608865, + "language_loss": 0.88896626, + 
"learning_rate": 0.0008193740542985244, + "loss": 0.9004541, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.16809082, + "step": 1565, + "time_per_iteration": 2.6722562313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165467, + "balance_loss_mlp": 1.14907598, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.1324055806972963, + "language_loss": 0.86720473, + "learning_rate": 0.0008191342867058467, + "loss": 0.8788594, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.16394043, + "step": 1566, + "time_per_iteration": 2.7314035892486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147033, + "balance_loss_mlp": 1.13058197, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.09630003386887155, + "language_loss": 0.83068216, + "learning_rate": 0.0008188943952142509, + "loss": 0.84215248, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.16455078, + "step": 1567, + "time_per_iteration": 2.8423235416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128472, + "balance_loss_mlp": 1.11148453, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.09368409570014515, + "language_loss": 0.82277513, + "learning_rate": 0.0008186543799168711, + "loss": 0.83405983, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.17004395, + "step": 1568, + "time_per_iteration": 3.1569459438323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096028, + "balance_loss_mlp": 1.07919598, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.20562474195503389, + "language_loss": 0.88231719, + "learning_rate": 0.0008184142409068892, + "loss": 0.89327747, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.16845703, + "step": 1569, + "time_per_iteration": 3.0334763526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089793, + 
"balance_loss_mlp": 1.0729959, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.06986495925142319, + "language_loss": 0.86445761, + "learning_rate": 0.000818173978277536, + "loss": 0.87535548, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.16809082, + "step": 1570, + "time_per_iteration": 2.6637074947357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085635, + "balance_loss_mlp": 1.06840897, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.09310337511128065, + "language_loss": 0.8345744, + "learning_rate": 0.000817933592122089, + "loss": 0.84543073, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.17236328, + "step": 1571, + "time_per_iteration": 2.693112850189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085619, + "balance_loss_mlp": 1.06780863, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.10986906736250873, + "language_loss": 0.83327937, + "learning_rate": 0.0008176930825338749, + "loss": 0.84413558, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.17810059, + "step": 1572, + "time_per_iteration": 2.609584331512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086583, + "balance_loss_mlp": 1.06848717, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.10627002925019795, + "language_loss": 0.88423979, + "learning_rate": 0.0008174524496062679, + "loss": 0.89510572, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.1809082, + "step": 1573, + "time_per_iteration": 2.9317731857299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085921, + "balance_loss_mlp": 1.06767023, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.08890838553235277, + "language_loss": 0.85423905, + "learning_rate": 0.0008172116934326894, + "loss": 0.86509824, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 
0.18249512, + "step": 1574, + "time_per_iteration": 2.795232057571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085798, + "balance_loss_mlp": 1.06757045, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.0994527497506169, + "language_loss": 0.87673843, + "learning_rate": 0.0008169708141066097, + "loss": 0.88759637, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.18212891, + "step": 1575, + "time_per_iteration": 2.587369203567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088984, + "balance_loss_mlp": 1.07053041, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.4142555186010625, + "language_loss": 0.90523762, + "learning_rate": 0.0008167298117215465, + "loss": 0.91612744, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.18432617, + "step": 1576, + "time_per_iteration": 2.591120481491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109689, + "balance_loss_mlp": 1.07822132, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.08528414160414997, + "language_loss": 0.87905335, + "learning_rate": 0.0008164886863710649, + "loss": 0.89002216, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.18652344, + "step": 1577, + "time_per_iteration": 2.9462757110595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130352, + "balance_loss_mlp": 1.11145782, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07426584678404557, + "language_loss": 0.85645878, + "learning_rate": 0.0008162474381487783, + "loss": 0.86776227, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.1887207, + "step": 1578, + "time_per_iteration": 3.1258718967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170066, + "balance_loss_mlp": 1.15105188, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.10196200235578438, + 
"language_loss": 0.849518, + "learning_rate": 0.0008160060671483475, + "loss": 0.86121869, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.19018555, + "step": 1579, + "time_per_iteration": 2.686903953552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193624, + "balance_loss_mlp": 1.17542076, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.11175205501845424, + "language_loss": 0.82875144, + "learning_rate": 0.0008157645734634809, + "loss": 0.84068769, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.18212891, + "step": 1580, + "time_per_iteration": 2.623169183731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146657, + "balance_loss_mlp": 1.13449764, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.05359937724929427, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78043151, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.12158203, + "step": 1581, + "time_per_iteration": 4.941681623458862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126623, + "balance_loss_mlp": 1.11465442, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.04979857074148905, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74341118, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.11962891, + "step": 1582, + "time_per_iteration": 4.878013372421265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233201, + "balance_loss_mlp": 1.21421146, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.08528831092857085, + "language_loss": 0.8396011, + "learning_rate": 0.000815039357240067, + "loss": 0.85193312, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.18969727, + "step": 1583, + "time_per_iteration": 2.643695116043091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01228928, + "balance_loss_mlp": 1.21003366, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 0.10406683839721904, + "language_loss": 0.8531003, + "learning_rate": 0.0008147973737554952, + "loss": 0.86538959, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.18884277, + "step": 1584, + "time_per_iteration": 2.780329942703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201707, + "balance_loss_mlp": 1.18393278, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.07761853967257432, + "language_loss": 0.86104375, + "learning_rate": 0.000814555268055744, + "loss": 0.87306082, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.17785645, + "step": 1585, + "time_per_iteration": 2.6921656131744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196281, + "balance_loss_mlp": 1.17799401, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07850387771459345, + "language_loss": 0.86948889, + "learning_rate": 0.0008143130402348073, + "loss": 0.88145167, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.18273926, + "step": 1586, + "time_per_iteration": 2.6515746116638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165884, + "balance_loss_mlp": 1.14803839, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.0685053805978033, + "language_loss": 0.79063147, + "learning_rate": 0.0008140706903867265, + "loss": 0.80229032, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.17858887, + "step": 1587, + "time_per_iteration": 2.823451042175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158917, + "balance_loss_mlp": 1.14067745, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.09375856425609289, + "language_loss": 0.90278405, + "learning_rate": 0.0008138282186055897, + "loss": 0.91437322, + "num_input_tokens_seen": 131325712, + 
"router_z_loss_mlp": 0.18261719, + "step": 1588, + "time_per_iteration": 2.7146568298339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147472, + "balance_loss_mlp": 1.12988853, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.0770581210118419, + "language_loss": 0.82476223, + "learning_rate": 0.0008135856249855331, + "loss": 0.83623695, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.17614746, + "step": 1589, + "time_per_iteration": 2.71938157081604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141231, + "balance_loss_mlp": 1.12317085, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.10579892777067937, + "language_loss": 0.89201659, + "learning_rate": 0.0008133429096207398, + "loss": 0.90342891, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.18066406, + "step": 1590, + "time_per_iteration": 2.828059434890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326323, + "balance_loss_mlp": 1.31087315, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.09384482719125187, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76638579, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.15429688, + "step": 1591, + "time_per_iteration": 5.056639909744263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158477, + "balance_loss_mlp": 1.13997602, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.07055782584393462, + "language_loss": 0.86496353, + "learning_rate": 0.0008128571140339123, + "loss": 0.87654829, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.18505859, + "step": 1592, + "time_per_iteration": 2.6639931201934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148416, + "balance_loss_mlp": 1.12930679, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 
0.0722691659040447, + "language_loss": 0.87266612, + "learning_rate": 0.0008126140340004805, + "loss": 0.88415021, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.19104004, + "step": 1593, + "time_per_iteration": 2.574216604232788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153692, + "balance_loss_mlp": 1.1345824, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.07242693719108233, + "language_loss": 0.81765437, + "learning_rate": 0.0008123708325995172, + "loss": 0.82919127, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.19104004, + "step": 1594, + "time_per_iteration": 3.2430498600006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160899, + "balance_loss_mlp": 1.14182544, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.08669645453401467, + "language_loss": 0.79659396, + "learning_rate": 0.0008121275099254414, + "loss": 0.80820298, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.19067383, + "step": 1595, + "time_per_iteration": 2.992558479309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116479, + "balance_loss_mlp": 1.14517975, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06321681758762837, + "language_loss": 0.88210988, + "learning_rate": 0.0008118840660727194, + "loss": 0.8937577, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.19592285, + "step": 1596, + "time_per_iteration": 2.655043840408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116269, + "balance_loss_mlp": 1.14316404, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.06781928625830316, + "language_loss": 0.87805635, + "learning_rate": 0.0008116405011358644, + "loss": 0.88968325, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.19519043, + "step": 1597, + "time_per_iteration": 3.180513620376587 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01172311, + "balance_loss_mlp": 1.15260601, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 0.0749329830796044, + "language_loss": 0.79566741, + "learning_rate": 0.0008113968152094369, + "loss": 0.80739057, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.19702148, + "step": 1598, + "time_per_iteration": 2.6038942337036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164591, + "balance_loss_mlp": 1.14439654, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.09148494515579969, + "language_loss": 0.82006347, + "learning_rate": 0.0008111530083880438, + "loss": 0.83170938, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.2019043, + "step": 1599, + "time_per_iteration": 2.9283370971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155155, + "balance_loss_mlp": 1.13517594, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.08461014219336162, + "language_loss": 0.86254573, + "learning_rate": 0.0008109090807663399, + "loss": 0.87409735, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.19970703, + "step": 1600, + "time_per_iteration": 2.825857639312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137804, + "balance_loss_mlp": 1.11677539, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.062223790852464995, + "language_loss": 0.88488859, + "learning_rate": 0.0008106650324390257, + "loss": 0.89626658, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.21032715, + "step": 1601, + "time_per_iteration": 2.8589255809783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112197, + "balance_loss_mlp": 1.10128665, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.07165476987233708, + "language_loss": 0.81206429, + "learning_rate": 0.0008104208635008493, + "loss": 0.82328397, + "num_input_tokens_seen": 
132541968, + "router_z_loss_mlp": 0.20690918, + "step": 1602, + "time_per_iteration": 2.6751368045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109456, + "balance_loss_mlp": 1.0886662, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.08196336802935668, + "language_loss": 0.81529546, + "learning_rate": 0.0008101765740466058, + "loss": 0.82638997, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.20788574, + "step": 1603, + "time_per_iteration": 2.5513291358947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103563, + "balance_loss_mlp": 1.08332109, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.0890222565523069, + "language_loss": 0.83796382, + "learning_rate": 0.0008099321641711364, + "loss": 0.8489995, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.20227051, + "step": 1604, + "time_per_iteration": 2.6779870986938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104404, + "balance_loss_mlp": 1.08353007, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.07300879059514653, + "language_loss": 0.83213902, + "learning_rate": 0.0008096876339693295, + "loss": 0.84318304, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.2088623, + "step": 1605, + "time_per_iteration": 2.667900800704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109091, + "balance_loss_mlp": 1.07006013, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.08337461956862639, + "language_loss": 0.81168187, + "learning_rate": 0.0008094429835361206, + "loss": 0.82259107, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.20861816, + "step": 1606, + "time_per_iteration": 3.0076494216918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081794, + "balance_loss_mlp": 1.06069374, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 
0.10542585380202701, + "language_loss": 0.85789704, + "learning_rate": 0.0008091982129664908, + "loss": 0.86871505, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.21105957, + "step": 1607, + "time_per_iteration": 2.730372428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087916, + "balance_loss_mlp": 1.06643414, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.07933352528165237, + "language_loss": 0.83225489, + "learning_rate": 0.0008089533223554687, + "loss": 0.84313411, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.21484375, + "step": 1608, + "time_per_iteration": 2.7049362659454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090604, + "balance_loss_mlp": 1.06942058, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.08271353671646894, + "language_loss": 0.85293424, + "learning_rate": 0.0008087083117981294, + "loss": 0.86384022, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.21179199, + "step": 1609, + "time_per_iteration": 2.8826427459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101999, + "balance_loss_mlp": 1.08043373, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.0996721022061816, + "language_loss": 0.88292408, + "learning_rate": 0.0008084631813895943, + "loss": 0.89394403, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.21569824, + "step": 1610, + "time_per_iteration": 2.7805559635162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121386, + "balance_loss_mlp": 1.10027432, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07842877021383077, + "language_loss": 0.83548594, + "learning_rate": 0.0008082179312250315, + "loss": 0.84669983, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.21118164, + "step": 1611, + "time_per_iteration": 2.676135540008545 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01388019, + "balance_loss_mlp": 1.36951745, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + "grad_norm": 0.08809519842771894, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81243861, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.18457031, + "step": 1612, + "time_per_iteration": 4.860031843185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126274, + "balance_loss_mlp": 1.24729049, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.05130460412725523, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77892077, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.15429688, + "step": 1613, + "time_per_iteration": 5.034562110900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199222, + "balance_loss_mlp": 1.18011272, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.0938643891544465, + "language_loss": 0.82239884, + "learning_rate": 0.0008074814631475545, + "loss": 0.83439106, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.19091797, + "step": 1614, + "time_per_iteration": 3.336702585220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212604, + "balance_loss_mlp": 1.19325638, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.08076281903906762, + "language_loss": 0.79283953, + "learning_rate": 0.0008072357349114907, + "loss": 0.80496556, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.19335938, + "step": 1615, + "time_per_iteration": 2.6835010051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230065, + "balance_loss_mlp": 1.21150458, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.10215362910815345, + "language_loss": 0.88464314, + "learning_rate": 0.0008069898873959363, + "loss": 0.89694381, + 
"num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.1854248, + "step": 1616, + "time_per_iteration": 2.669456958770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213455, + "balance_loss_mlp": 1.19514489, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.07300003813068634, + "language_loss": 0.85508597, + "learning_rate": 0.0008067439206963375, + "loss": 0.86722052, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.18310547, + "step": 1617, + "time_per_iteration": 2.641707420349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202163, + "balance_loss_mlp": 1.18378067, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.08997138772617237, + "language_loss": 0.86023128, + "learning_rate": 0.0008064978349081873, + "loss": 0.87225294, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.18395996, + "step": 1618, + "time_per_iteration": 2.998687982559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181054, + "balance_loss_mlp": 1.1626246, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.07073814720845698, + "language_loss": 0.8619715, + "learning_rate": 0.0008062516301270245, + "loss": 0.87378204, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.1842041, + "step": 1619, + "time_per_iteration": 2.72948956489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187406, + "balance_loss_mlp": 1.16931009, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.06466481546389395, + "language_loss": 0.88310599, + "learning_rate": 0.0008060053064484343, + "loss": 0.89498007, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.18115234, + "step": 1620, + "time_per_iteration": 2.9406392574310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188397, + "balance_loss_mlp": 1.17067063, + "epoch": 0.31185071181223545, + "flos": 
586149861888.0, + "grad_norm": 0.09059197010434686, + "language_loss": 0.84835637, + "learning_rate": 0.0008057588639680482, + "loss": 0.86024034, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.17724609, + "step": 1621, + "time_per_iteration": 2.7712435722351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172579, + "balance_loss_mlp": 1.15451908, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.0998699448976919, + "language_loss": 0.83181798, + "learning_rate": 0.0008055123027815434, + "loss": 0.84354383, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.18078613, + "step": 1622, + "time_per_iteration": 2.918195962905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158801, + "balance_loss_mlp": 1.14063358, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.08307305946300769, + "language_loss": 0.8472932, + "learning_rate": 0.0008052656229846436, + "loss": 0.85888124, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.18164062, + "step": 1623, + "time_per_iteration": 2.6911518573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141486, + "balance_loss_mlp": 1.12317586, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.13857337515277973, + "language_loss": 0.90054119, + "learning_rate": 0.0008050188246731182, + "loss": 0.91195607, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.18322754, + "step": 1624, + "time_per_iteration": 2.682352066040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132158, + "balance_loss_mlp": 1.11350143, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.07575228871239431, + "language_loss": 0.81929862, + "learning_rate": 0.0008047719079427834, + "loss": 0.83062017, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.18664551, + "step": 1625, + "time_per_iteration": 2.9942879676818848 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230131, + "balance_loss_mlp": 1.21601677, + "epoch": 0.3128126202385533, + "flos": 1559232073728.0, + "grad_norm": 0.048676192852424666, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75581837, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.14160156, + "step": 1626, + "time_per_iteration": 4.848233938217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108724, + "balance_loss_mlp": 1.08925653, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.0694146578244244, + "language_loss": 0.86078912, + "learning_rate": 0.0008042777196091757, + "loss": 0.87187636, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.19458008, + "step": 1627, + "time_per_iteration": 2.701900005340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116209, + "balance_loss_mlp": 1.09631276, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.08749628678496815, + "language_loss": 0.81888652, + "learning_rate": 0.0008040304481977643, + "loss": 0.83004862, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.19885254, + "step": 1628, + "time_per_iteration": 2.696526527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138715, + "balance_loss_mlp": 1.11946249, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.07447099765210985, + "language_loss": 0.8675555, + "learning_rate": 0.0008037830587512649, + "loss": 0.87894267, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.19250488, + "step": 1629, + "time_per_iteration": 3.0616016387939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134253, + "balance_loss_mlp": 1.11413062, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.09771619875867958, + "language_loss": 0.78561771, + "learning_rate": 0.0008035355513657224, + "loss": 
0.79696023, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.20117188, + "step": 1630, + "time_per_iteration": 2.4754045009613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137016, + "balance_loss_mlp": 1.11708379, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.08006054346576318, + "language_loss": 0.9267844, + "learning_rate": 0.0008032879261372279, + "loss": 0.93815458, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.19921875, + "step": 1631, + "time_per_iteration": 2.802116870880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162193, + "balance_loss_mlp": 1.14845991, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.027777304949473513, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80798036, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.13769531, + "step": 1632, + "time_per_iteration": 5.508919715881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119807, + "balance_loss_mlp": 1.10029221, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.0647776963699187, + "language_loss": 0.86985779, + "learning_rate": 0.0008027923225359748, + "loss": 0.88105589, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.19506836, + "step": 1633, + "time_per_iteration": 2.600407600402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108986, + "balance_loss_mlp": 1.08867252, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.07494658582155435, + "language_loss": 0.87969911, + "learning_rate": 0.0008025443443556267, + "loss": 0.89078891, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.20300293, + "step": 1634, + "time_per_iteration": 2.721635103225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103961, + "balance_loss_mlp": 1.08468509, + "epoch": 0.31454405540592534, + 
"flos": 648362589696.0, + "grad_norm": 0.09628820684288855, + "language_loss": 0.88015246, + "learning_rate": 0.000802296248717147, + "loss": 0.89119208, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.19262695, + "step": 1635, + "time_per_iteration": 2.94401478767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090786, + "balance_loss_mlp": 1.07087779, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.07971253455476307, + "language_loss": 0.78918988, + "learning_rate": 0.0008020480357168554, + "loss": 0.8000977, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.19897461, + "step": 1636, + "time_per_iteration": 2.863992691040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089663, + "balance_loss_mlp": 1.07011271, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.07737806088204505, + "language_loss": 0.87917638, + "learning_rate": 0.0008017997054511165, + "loss": 0.890073, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.1953125, + "step": 1637, + "time_per_iteration": 2.586543083190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087265, + "balance_loss_mlp": 1.06765532, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.08038806705740831, + "language_loss": 0.85134554, + "learning_rate": 0.0008015512580163407, + "loss": 0.86221826, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.19592285, + "step": 1638, + "time_per_iteration": 2.8016490936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084302, + "balance_loss_mlp": 1.06364322, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.07403915674476273, + "language_loss": 0.80143899, + "learning_rate": 0.0008013026935089838, + "loss": 0.81228203, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.20666504, + "step": 1639, + "time_per_iteration": 2.906219244003296 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086238, + "balance_loss_mlp": 1.06543589, + "epoch": 0.31550596383224316, + "flos": 572545635840.0, + "grad_norm": 0.08080644571808258, + "language_loss": 0.83962494, + "learning_rate": 0.0008010540120255472, + "loss": 0.85048735, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.20788574, + "step": 1640, + "time_per_iteration": 2.6874494552612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093894, + "balance_loss_mlp": 1.07238901, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.10412897550370145, + "language_loss": 0.85903674, + "learning_rate": 0.0008008052136625774, + "loss": 0.86997569, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.21508789, + "step": 1641, + "time_per_iteration": 2.806689977645874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101865, + "balance_loss_mlp": 1.08080053, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.07569050828740802, + "language_loss": 0.86666101, + "learning_rate": 0.0008005562985166666, + "loss": 0.87767971, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.21069336, + "step": 1642, + "time_per_iteration": 2.7800753116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109644, + "balance_loss_mlp": 1.08823395, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.05889143992207802, + "language_loss": 0.85174221, + "learning_rate": 0.0008003072666844524, + "loss": 0.86283863, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.21411133, + "step": 1643, + "time_per_iteration": 2.722987651824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122613, + "balance_loss_mlp": 1.10185909, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.0837642836105996, + "language_loss": 0.82220256, + "learning_rate": 0.0008000581182626173, + "loss": 
0.83342868, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.20751953, + "step": 1644, + "time_per_iteration": 2.5624425411224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143306, + "balance_loss_mlp": 1.12279046, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.21399278605623545, + "language_loss": 0.85377562, + "learning_rate": 0.0007998088533478894, + "loss": 0.86520875, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.2052002, + "step": 1645, + "time_per_iteration": 2.657808542251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118356, + "balance_loss_mlp": 1.09847164, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.1165927047614104, + "language_loss": 0.83989012, + "learning_rate": 0.000799559472037042, + "loss": 0.85107368, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.19873047, + "step": 1646, + "time_per_iteration": 2.5764071941375732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101642, + "balance_loss_mlp": 1.08161449, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.06134983371250154, + "language_loss": 0.87497842, + "learning_rate": 0.0007993099744268932, + "loss": 0.88599485, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.20031738, + "step": 1647, + "time_per_iteration": 2.9123756885528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094967, + "balance_loss_mlp": 1.07502329, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.08774031682584008, + "language_loss": 0.87840933, + "learning_rate": 0.000799060360614307, + "loss": 0.889359, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.19934082, + "step": 1648, + "time_per_iteration": 2.7346584796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089439, + "balance_loss_mlp": 1.06954336, + "epoch": 0.3172373989996152, + 
"flos": 827124203520.0, + "grad_norm": 0.07558157708493889, + "language_loss": 0.8330996, + "learning_rate": 0.0007988106306961917, + "loss": 0.84399396, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.19885254, + "step": 1649, + "time_per_iteration": 3.1326329708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091589, + "balance_loss_mlp": 1.07182384, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.0875083493892423, + "language_loss": 0.84519339, + "learning_rate": 0.0007985607847695014, + "loss": 0.85610926, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.19750977, + "step": 1650, + "time_per_iteration": 2.689587354660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087502, + "balance_loss_mlp": 1.06813097, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.10331276722207645, + "language_loss": 0.82647395, + "learning_rate": 0.0007983108229312345, + "loss": 0.83734906, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.19348145, + "step": 1651, + "time_per_iteration": 2.935060501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094077, + "balance_loss_mlp": 1.07493234, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.08920057207213788, + "language_loss": 0.86297011, + "learning_rate": 0.0007980607452784351, + "loss": 0.8739109, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.19128418, + "step": 1652, + "time_per_iteration": 2.5893616676330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090057, + "balance_loss_mlp": 1.07070947, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.10003790987475829, + "language_loss": 0.90127802, + "learning_rate": 0.0007978105519081919, + "loss": 0.91217864, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.1932373, + "step": 1653, + "time_per_iteration": 2.7026524543762207 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091886, + "balance_loss_mlp": 1.07306278, + "epoch": 0.31819930742593305, + "flos": 516895312896.0, + "grad_norm": 0.08393617058492224, + "language_loss": 0.87581307, + "learning_rate": 0.0007975602429176385, + "loss": 0.88673192, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.18811035, + "step": 1654, + "time_per_iteration": 2.652863025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110285, + "balance_loss_mlp": 1.08389616, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.08283763038644905, + "language_loss": 0.8141948, + "learning_rate": 0.0007973098184039536, + "loss": 0.82522333, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.18933105, + "step": 1655, + "time_per_iteration": 2.658590316772461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113313, + "balance_loss_mlp": 1.09477568, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.08159903981201219, + "language_loss": 0.86618698, + "learning_rate": 0.0007970592784643602, + "loss": 0.87732017, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.18518066, + "step": 1656, + "time_per_iteration": 2.892390251159668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138107, + "balance_loss_mlp": 1.11967695, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.07828329710087445, + "language_loss": 0.84808218, + "learning_rate": 0.0007968086231961272, + "loss": 0.85946327, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.18432617, + "step": 1657, + "time_per_iteration": 2.659250497817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169742, + "balance_loss_mlp": 1.15010786, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.1537731911276923, + "language_loss": 0.8331663, + "learning_rate": 0.0007965578526965671, + "loss": 
0.84486371, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.19616699, + "step": 1658, + "time_per_iteration": 2.6129345893859863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115739, + "balance_loss_mlp": 1.13819742, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.07993574913147765, + "language_loss": 0.86468869, + "learning_rate": 0.0007963069670630377, + "loss": 0.87626261, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.19189453, + "step": 1659, + "time_per_iteration": 2.735495090484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150627, + "balance_loss_mlp": 1.13118374, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.07695546581371572, + "language_loss": 0.87941194, + "learning_rate": 0.0007960559663929416, + "loss": 0.8909182, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.19421387, + "step": 1660, + "time_per_iteration": 2.6464481353759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144507, + "balance_loss_mlp": 1.12452734, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.0701260521045673, + "language_loss": 0.87574112, + "learning_rate": 0.0007958048507837259, + "loss": 0.88718617, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.19995117, + "step": 1661, + "time_per_iteration": 2.964620590209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135721, + "balance_loss_mlp": 1.11478782, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.08820049354030167, + "language_loss": 0.87464488, + "learning_rate": 0.0007955536203328822, + "loss": 0.88600206, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.20947266, + "step": 1662, + "time_per_iteration": 2.9402856826782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128807, + "balance_loss_mlp": 1.10893452, + "epoch": 0.3199307425933051, + 
"flos": 560549611008.0, + "grad_norm": 0.0703581314218412, + "language_loss": 0.83491433, + "learning_rate": 0.0007953022751379469, + "loss": 0.84620237, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.1986084, + "step": 1663, + "time_per_iteration": 2.8694913387298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133101, + "balance_loss_mlp": 1.11183429, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.07762769933283196, + "language_loss": 0.81855732, + "learning_rate": 0.000795050815296501, + "loss": 0.82988834, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.21264648, + "step": 1664, + "time_per_iteration": 2.9839534759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133942, + "balance_loss_mlp": 1.11387873, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.06538130148842129, + "language_loss": 0.92802906, + "learning_rate": 0.0007947992409061695, + "loss": 0.93936849, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.20068359, + "step": 1665, + "time_per_iteration": 2.600677013397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128395, + "balance_loss_mlp": 1.10815299, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07570782620206934, + "language_loss": 0.86083347, + "learning_rate": 0.0007945475520646226, + "loss": 0.8721174, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.20227051, + "step": 1666, + "time_per_iteration": 2.960444211959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126888, + "balance_loss_mlp": 1.10798109, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.08296696017450861, + "language_loss": 0.84656757, + "learning_rate": 0.0007942957488695743, + "loss": 0.85783648, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.18908691, + "step": 1667, + "time_per_iteration": 2.671600341796875 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131636, + "balance_loss_mlp": 1.11284864, + "epoch": 0.32089265101962294, + "flos": 745295468544.0, + "grad_norm": 0.06557982969248469, + "language_loss": 0.80884814, + "learning_rate": 0.0007940438314187833, + "loss": 0.82016456, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.18774414, + "step": 1668, + "time_per_iteration": 3.0618937015533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129602, + "balance_loss_mlp": 1.11102939, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.08496063360517363, + "language_loss": 0.80308306, + "learning_rate": 0.0007937917998100529, + "loss": 0.8143791, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.18566895, + "step": 1669, + "time_per_iteration": 2.6219253540039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139247, + "balance_loss_mlp": 1.12098432, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.07361411804364891, + "language_loss": 0.78932178, + "learning_rate": 0.0007935396541412302, + "loss": 0.80071419, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.18273926, + "step": 1670, + "time_per_iteration": 2.6380372047424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148818, + "balance_loss_mlp": 1.13088846, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.07283292072888313, + "language_loss": 0.85630834, + "learning_rate": 0.0007932873945102068, + "loss": 0.86779654, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.17932129, + "step": 1671, + "time_per_iteration": 2.6828458309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171107, + "balance_loss_mlp": 1.15642071, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.02887484158654099, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + 
"loss": 0.76932883, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.14648438, + "step": 1672, + "time_per_iteration": 4.8265416622161865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160939, + "balance_loss_mlp": 1.14286733, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.07500648032395062, + "language_loss": 0.86484933, + "learning_rate": 0.0007927825337533461, + "loss": 0.87645876, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.18078613, + "step": 1673, + "time_per_iteration": 2.7402546405792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155615, + "balance_loss_mlp": 1.1377933, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.10786589074132553, + "language_loss": 0.84594876, + "learning_rate": 0.0007925299328235131, + "loss": 0.8575049, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.17822266, + "step": 1674, + "time_per_iteration": 2.663360118865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149086, + "balance_loss_mlp": 1.13095438, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.09107412637612472, + "language_loss": 0.84947217, + "learning_rate": 0.000792277218323488, + "loss": 0.86096299, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.18139648, + "step": 1675, + "time_per_iteration": 2.608579158782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136338, + "balance_loss_mlp": 1.11837292, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.07405590971136047, + "language_loss": 0.84631819, + "learning_rate": 0.0007920243903513833, + "loss": 0.85768151, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.17956543, + "step": 1676, + "time_per_iteration": 2.598543882369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128075, + "balance_loss_mlp": 1.10991931, + "epoch": 
0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.08030295134522303, + "language_loss": 0.83944809, + "learning_rate": 0.0007917714490053556, + "loss": 0.85072881, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.18164062, + "step": 1677, + "time_per_iteration": 2.6944823265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126784, + "balance_loss_mlp": 1.10863996, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.06747924585348261, + "language_loss": 0.86233467, + "learning_rate": 0.0007915183943836055, + "loss": 0.87360251, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.18164062, + "step": 1678, + "time_per_iteration": 2.9165165424346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120975, + "balance_loss_mlp": 1.10280752, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.11051827421234449, + "language_loss": 0.84204686, + "learning_rate": 0.0007912652265843773, + "loss": 0.85325664, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.18164062, + "step": 1679, + "time_per_iteration": 3.141361713409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108875, + "balance_loss_mlp": 1.09056485, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.06834343380772315, + "language_loss": 0.81678128, + "learning_rate": 0.0007910119457059597, + "loss": 0.82787001, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.1829834, + "step": 1680, + "time_per_iteration": 2.7235679626464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097161, + "balance_loss_mlp": 1.07836151, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.08108919878534793, + "language_loss": 0.80109823, + "learning_rate": 0.0007907585518466849, + "loss": 0.81206989, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.18798828, + "step": 1681, + 
"time_per_iteration": 2.9778435230255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096448, + "balance_loss_mlp": 1.07823253, + "epoch": 0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.07179806444318433, + "language_loss": 0.89356047, + "learning_rate": 0.000790505045104929, + "loss": 0.90452492, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.18200684, + "step": 1682, + "time_per_iteration": 2.522502899169922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092596, + "balance_loss_mlp": 1.07453537, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.07276753556485034, + "language_loss": 0.86845744, + "learning_rate": 0.0007902514255791125, + "loss": 0.87938344, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.18066406, + "step": 1683, + "time_per_iteration": 2.7951602935791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094831, + "balance_loss_mlp": 1.07612705, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.14328213003802046, + "language_loss": 0.87945193, + "learning_rate": 0.0007899976933676986, + "loss": 0.89040023, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.18701172, + "step": 1684, + "time_per_iteration": 3.0410313606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095322, + "balance_loss_mlp": 1.0759027, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.09505530250353386, + "language_loss": 0.8717491, + "learning_rate": 0.0007897438485691955, + "loss": 0.88270235, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.19396973, + "step": 1685, + "time_per_iteration": 2.717643976211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109744, + "balance_loss_mlp": 1.09030128, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.0737580177172555, + "language_loss": 0.82153177, + 
"learning_rate": 0.0007894898912821542, + "loss": 0.8326292, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.19433594, + "step": 1686, + "time_per_iteration": 2.529229164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103459, + "balance_loss_mlp": 1.0848738, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.06566778614017829, + "language_loss": 0.86626494, + "learning_rate": 0.0007892358216051695, + "loss": 0.87729949, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.18566895, + "step": 1687, + "time_per_iteration": 2.7486979961395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103486, + "balance_loss_mlp": 1.08472204, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.06759540868164342, + "language_loss": 0.91712224, + "learning_rate": 0.0007889816396368803, + "loss": 0.92815715, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.18737793, + "step": 1688, + "time_per_iteration": 2.6558406352996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114869, + "balance_loss_mlp": 1.09629631, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.08904939998236257, + "language_loss": 0.85158062, + "learning_rate": 0.0007887273454759687, + "loss": 0.86272931, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.18566895, + "step": 1689, + "time_per_iteration": 2.4704487323760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120516, + "balance_loss_mlp": 1.10219383, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.07572457526068059, + "language_loss": 0.82346898, + "learning_rate": 0.0007884729392211603, + "loss": 0.83467412, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.18322754, + "step": 1690, + "time_per_iteration": 2.703683614730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110893, + 
"balance_loss_mlp": 1.09243917, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09550307140961752, + "language_loss": 0.85592222, + "learning_rate": 0.0007882184209712245, + "loss": 0.86703116, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.18444824, + "step": 1691, + "time_per_iteration": 2.560342788696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103123, + "balance_loss_mlp": 1.0847528, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.06639873617663411, + "language_loss": 0.85215127, + "learning_rate": 0.000787963790824974, + "loss": 0.86318254, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.18371582, + "step": 1692, + "time_per_iteration": 3.01053786277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102989, + "balance_loss_mlp": 1.08483362, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.0791061376464097, + "language_loss": 0.89282072, + "learning_rate": 0.0007877090488812651, + "loss": 0.90385056, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.18164062, + "step": 1693, + "time_per_iteration": 2.4398083686828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101181, + "balance_loss_mlp": 1.08242917, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.07726533895166562, + "language_loss": 0.8386811, + "learning_rate": 0.0007874541952389973, + "loss": 0.84969294, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.1875, + "step": 1694, + "time_per_iteration": 2.6756813526153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104451, + "balance_loss_mlp": 1.08591402, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.08042259552829657, + "language_loss": 0.86563015, + "learning_rate": 0.0007871992299971136, + "loss": 0.87667465, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.1854248, 
+ "step": 1695, + "time_per_iteration": 2.5899436473846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114554, + "balance_loss_mlp": 1.096017, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.10859344338562153, + "language_loss": 0.84131289, + "learning_rate": 0.0007869441532546001, + "loss": 0.85245848, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.18530273, + "step": 1696, + "time_per_iteration": 2.7561304569244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107684, + "balance_loss_mlp": 1.08946884, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.10465149109525512, + "language_loss": 0.79480183, + "learning_rate": 0.0007866889651104867, + "loss": 0.8058787, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.18225098, + "step": 1697, + "time_per_iteration": 2.8031740188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108686, + "balance_loss_mlp": 1.08992255, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.0906406666849178, + "language_loss": 0.83109629, + "learning_rate": 0.000786433665663846, + "loss": 0.84218317, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.18762207, + "step": 1698, + "time_per_iteration": 2.6932730674743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106374, + "balance_loss_mlp": 1.08788502, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.09684550827651525, + "language_loss": 0.86934984, + "learning_rate": 0.0007861782550137942, + "loss": 0.88041353, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.18481445, + "step": 1699, + "time_per_iteration": 2.924246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111141, + "balance_loss_mlp": 1.09345734, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.08559105168392155, + "language_loss": 
0.85866642, + "learning_rate": 0.0007859227332594901, + "loss": 0.86978048, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.17956543, + "step": 1700, + "time_per_iteration": 2.930842876434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106023, + "balance_loss_mlp": 1.0883081, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.09580530814462011, + "language_loss": 0.84299338, + "learning_rate": 0.0007856671005001365, + "loss": 0.85405362, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.17712402, + "step": 1701, + "time_per_iteration": 3.2081515789031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110257, + "balance_loss_mlp": 1.09185123, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.08565892816740808, + "language_loss": 0.81811458, + "learning_rate": 0.0007854113568349787, + "loss": 0.8292172, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.18408203, + "step": 1702, + "time_per_iteration": 3.1229259967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107177, + "balance_loss_mlp": 1.08861589, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.07794566968546403, + "language_loss": 0.80742395, + "learning_rate": 0.0007851555023633052, + "loss": 0.81849575, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.18554688, + "step": 1703, + "time_per_iteration": 2.87683367729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093514, + "balance_loss_mlp": 1.07504809, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.08579630919656539, + "language_loss": 0.82316363, + "learning_rate": 0.0007848995371844474, + "loss": 0.83409876, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.18469238, + "step": 1704, + "time_per_iteration": 2.543123483657837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108458, + 
"balance_loss_mlp": 1.09000456, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.08180134109500492, + "language_loss": 0.80497056, + "learning_rate": 0.0007846434613977801, + "loss": 0.81605512, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.18444824, + "step": 1705, + "time_per_iteration": 2.5694901943206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096099, + "balance_loss_mlp": 1.07726395, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.08642702147252447, + "language_loss": 0.7816267, + "learning_rate": 0.0007843872751027203, + "loss": 0.79258776, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.18835449, + "step": 1706, + "time_per_iteration": 2.8476855754852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091232, + "balance_loss_mlp": 1.07206345, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.07466271413415602, + "language_loss": 0.87096149, + "learning_rate": 0.0007841309783987287, + "loss": 0.88187379, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.19152832, + "step": 1707, + "time_per_iteration": 2.752048969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090341, + "balance_loss_mlp": 1.0709219, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.08448532304164387, + "language_loss": 0.8909331, + "learning_rate": 0.0007838745713853084, + "loss": 0.90183651, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.1940918, + "step": 1708, + "time_per_iteration": 2.576037883758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085126, + "balance_loss_mlp": 1.06595731, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.08173004229220915, + "language_loss": 0.84132832, + "learning_rate": 0.0007836180541620053, + "loss": 0.85217953, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 
0.19152832, + "step": 1709, + "time_per_iteration": 2.7169644832611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084228, + "balance_loss_mlp": 1.06489253, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.09936539185168088, + "language_loss": 0.86458898, + "learning_rate": 0.0007833614268284082, + "loss": 0.8754313, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.19311523, + "step": 1710, + "time_per_iteration": 2.5532357692718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119417, + "balance_loss_mlp": 1.17919695, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.0502772245871811, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75303936, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.14941406, + "step": 1711, + "time_per_iteration": 4.93800163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084968, + "balance_loss_mlp": 1.06610942, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.0930127101012754, + "language_loss": 0.78468674, + "learning_rate": 0.0007828478422289016, + "loss": 0.7955364, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.18835449, + "step": 1712, + "time_per_iteration": 2.6106202602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094707, + "balance_loss_mlp": 1.0755266, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.07722441463790092, + "language_loss": 0.88823062, + "learning_rate": 0.0007825908851623833, + "loss": 0.89917773, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.19165039, + "step": 1713, + "time_per_iteration": 2.7708652019500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099793, + "balance_loss_mlp": 1.08030224, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.08538102567636462, + 
"language_loss": 0.84563339, + "learning_rate": 0.0007823338183843533, + "loss": 0.85663128, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.19482422, + "step": 1714, + "time_per_iteration": 2.6919374465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101813, + "balance_loss_mlp": 1.08302569, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.10472435712491576, + "language_loss": 0.80579829, + "learning_rate": 0.0007820766419946141, + "loss": 0.81681645, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.18762207, + "step": 1715, + "time_per_iteration": 3.3962650299072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133772, + "balance_loss_mlp": 1.12051618, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.022367363269540627, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80806249, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.1328125, + "step": 1716, + "time_per_iteration": 4.940594434738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117089, + "balance_loss_mlp": 1.0989933, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.07989503427724588, + "language_loss": 0.7557565, + "learning_rate": 0.0007815619607794288, + "loss": 0.76692742, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.1809082, + "step": 1717, + "time_per_iteration": 2.6619300842285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112079, + "balance_loss_mlp": 1.10175252, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.08732146715249756, + "language_loss": 0.82213569, + "learning_rate": 0.0007813044561538001, + "loss": 0.83334363, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.19030762, + "step": 1718, + "time_per_iteration": 3.146427869796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01118808, + "balance_loss_mlp": 1.0996747, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.07987567281751332, + "language_loss": 0.88114393, + "learning_rate": 0.0007810468423160958, + "loss": 0.89233208, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.19128418, + "step": 1719, + "time_per_iteration": 2.882783889770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116955, + "balance_loss_mlp": 1.09883487, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.07516231806962957, + "language_loss": 0.81837869, + "learning_rate": 0.0007807891193663306, + "loss": 0.82954824, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.18127441, + "step": 1720, + "time_per_iteration": 2.817091464996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115372, + "balance_loss_mlp": 1.09681106, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.08207921946386207, + "language_loss": 0.82360268, + "learning_rate": 0.0007805312874045614, + "loss": 0.83475637, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.18566895, + "step": 1721, + "time_per_iteration": 2.5788111686706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127428, + "balance_loss_mlp": 1.10856915, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.08587725731854692, + "language_loss": 0.86701787, + "learning_rate": 0.0007802733465308874, + "loss": 0.87829208, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.18847656, + "step": 1722, + "time_per_iteration": 2.47092866897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134903, + "balance_loss_mlp": 1.11681938, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.07875115394989439, + "language_loss": 0.84537411, + "learning_rate": 0.0007800152968454501, + "loss": 0.85672319, + "num_input_tokens_seen": 142462112, + 
"router_z_loss_mlp": 0.1809082, + "step": 1723, + "time_per_iteration": 2.689821481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134288, + "balance_loss_mlp": 1.1161443, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.07553816314554183, + "language_loss": 0.90259147, + "learning_rate": 0.0007797571384484334, + "loss": 0.91393435, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.18139648, + "step": 1724, + "time_per_iteration": 2.881140947341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130284, + "balance_loss_mlp": 1.11211705, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.09124178304656469, + "language_loss": 0.91919303, + "learning_rate": 0.0007794988714400633, + "loss": 0.93049586, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.18164062, + "step": 1725, + "time_per_iteration": 2.6405282020568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127051, + "balance_loss_mlp": 1.10823941, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.08426272849970545, + "language_loss": 0.85092092, + "learning_rate": 0.0007792404959206079, + "loss": 0.8621915, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.18798828, + "step": 1726, + "time_per_iteration": 2.5432610511779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127088, + "balance_loss_mlp": 1.1084559, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.07425680572728817, + "language_loss": 0.81119555, + "learning_rate": 0.0007789820119903774, + "loss": 0.82246637, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.1862793, + "step": 1727, + "time_per_iteration": 3.032222270965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139545, + "balance_loss_mlp": 1.12562108, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 
0.028014537923784853, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79632211, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.13964844, + "step": 1728, + "time_per_iteration": 4.8402745723724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136807, + "balance_loss_mlp": 1.11797178, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.0895292490434253, + "language_loss": 0.8341223, + "learning_rate": 0.0007784647192990428, + "loss": 0.84549034, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.18798828, + "step": 1729, + "time_per_iteration": 2.732290267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138805, + "balance_loss_mlp": 1.11925435, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.13711052560491443, + "language_loss": 0.80506217, + "learning_rate": 0.0007782059107387696, + "loss": 0.81645024, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.1953125, + "step": 1730, + "time_per_iteration": 2.8793182373046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114255, + "balance_loss_mlp": 1.12199879, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.08825875418673053, + "language_loss": 0.8822093, + "learning_rate": 0.0007779469941693826, + "loss": 0.8936348, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.20556641, + "step": 1731, + "time_per_iteration": 2.862053632736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136737, + "balance_loss_mlp": 1.11668622, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.0849632369239172, + "language_loss": 0.77099073, + "learning_rate": 0.0007776879696914029, + "loss": 0.78235817, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.20043945, + "step": 1732, + "time_per_iteration": 2.878997325897217 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01137118, + "balance_loss_mlp": 1.11639929, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.06630968591745413, + "language_loss": 0.88863558, + "learning_rate": 0.000777428837405392, + "loss": 0.90000677, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.20715332, + "step": 1733, + "time_per_iteration": 2.849579095840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113514, + "balance_loss_mlp": 1.1140877, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.1678685499329745, + "language_loss": 0.86820018, + "learning_rate": 0.0007771695974119544, + "loss": 0.87955153, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.21069336, + "step": 1734, + "time_per_iteration": 2.5213568210601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011438, + "balance_loss_mlp": 1.12223458, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.07580918658919847, + "language_loss": 0.75353694, + "learning_rate": 0.0007769102498117359, + "loss": 0.76497495, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.21569824, + "step": 1735, + "time_per_iteration": 3.1764426231384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152046, + "balance_loss_mlp": 1.12946832, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.07940235688963863, + "language_loss": 0.79215956, + "learning_rate": 0.000776650794705424, + "loss": 0.80368006, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.22570801, + "step": 1736, + "time_per_iteration": 3.311570644378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150562, + "balance_loss_mlp": 1.12822187, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.07154101803961593, + "language_loss": 0.82120311, + "learning_rate": 0.0007763912321937483, + "loss": 0.83270872, + "num_input_tokens_seen": 
143663344, + "router_z_loss_mlp": 0.22351074, + "step": 1737, + "time_per_iteration": 2.7742059230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162448, + "balance_loss_mlp": 1.14046574, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.09893982821491046, + "language_loss": 0.82392818, + "learning_rate": 0.0007761315623774799, + "loss": 0.83555263, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.21972656, + "step": 1738, + "time_per_iteration": 3.4311513900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158796, + "balance_loss_mlp": 1.1368016, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.09029538875627986, + "language_loss": 0.87794083, + "learning_rate": 0.0007758717853574313, + "loss": 0.88952881, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.22009277, + "step": 1739, + "time_per_iteration": 2.771195411682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165102, + "balance_loss_mlp": 1.14437175, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.0906994231642372, + "language_loss": 0.89945674, + "learning_rate": 0.0007756119012344571, + "loss": 0.91110778, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.20739746, + "step": 1740, + "time_per_iteration": 2.60304594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150213, + "balance_loss_mlp": 1.12998307, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.09292231464176055, + "language_loss": 0.8424325, + "learning_rate": 0.0007753519101094535, + "loss": 0.85393465, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.20227051, + "step": 1741, + "time_per_iteration": 2.763831377029419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130901, + "balance_loss_mlp": 1.11101699, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 
0.09107418087972757, + "language_loss": 0.86003816, + "learning_rate": 0.0007750918120833575, + "loss": 0.87134719, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.19873047, + "step": 1742, + "time_per_iteration": 2.5983192920684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110821, + "balance_loss_mlp": 1.08914852, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.08951756084527424, + "language_loss": 0.86919558, + "learning_rate": 0.0007748316072571485, + "loss": 0.88027763, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.19042969, + "step": 1743, + "time_per_iteration": 2.826857328414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096104, + "balance_loss_mlp": 1.07641089, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.07101368717418235, + "language_loss": 0.78953618, + "learning_rate": 0.0007745712957318467, + "loss": 0.80049723, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.19677734, + "step": 1744, + "time_per_iteration": 2.9848310947418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099368, + "balance_loss_mlp": 1.08075917, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06728871536655502, + "language_loss": 0.86402392, + "learning_rate": 0.0007743108776085141, + "loss": 0.87501758, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.18603516, + "step": 1745, + "time_per_iteration": 2.7903690338134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100918, + "balance_loss_mlp": 1.08167791, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.08105774730722601, + "language_loss": 0.83074069, + "learning_rate": 0.0007740503529882543, + "loss": 0.84174985, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.19238281, + "step": 1746, + "time_per_iteration": 2.8164098262786865 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01102514, + "balance_loss_mlp": 1.08327341, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 0.08939656691142209, + "language_loss": 0.90720791, + "learning_rate": 0.0007737897219722114, + "loss": 0.91823304, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.19226074, + "step": 1747, + "time_per_iteration": 2.682877540588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098067, + "balance_loss_mlp": 1.07800448, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.08976840313873562, + "language_loss": 0.81010032, + "learning_rate": 0.0007735289846615716, + "loss": 0.82108104, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.20068359, + "step": 1748, + "time_per_iteration": 2.687856674194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096768, + "balance_loss_mlp": 1.07715857, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.08605901070846078, + "language_loss": 0.81949353, + "learning_rate": 0.0007732681411575621, + "loss": 0.83046126, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.19616699, + "step": 1749, + "time_per_iteration": 2.711014747619629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100357, + "balance_loss_mlp": 1.08002043, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.0865041685268045, + "language_loss": 0.87347746, + "learning_rate": 0.0007730071915614514, + "loss": 0.88448107, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.20349121, + "step": 1750, + "time_per_iteration": 2.7877442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097656, + "balance_loss_mlp": 1.07754588, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.099917727371098, + "language_loss": 0.88751096, + "learning_rate": 0.0007727461359745489, + "loss": 0.89848751, + "num_input_tokens_seen": 
144682560, + "router_z_loss_mlp": 0.2010498, + "step": 1751, + "time_per_iteration": 2.5344979763031006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110101, + "balance_loss_mlp": 1.09051538, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.06874041131201088, + "language_loss": 0.85970122, + "learning_rate": 0.0007724849744982056, + "loss": 0.87080222, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.19592285, + "step": 1752, + "time_per_iteration": 2.7278292179107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118351, + "balance_loss_mlp": 1.09820437, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.07532767444648983, + "language_loss": 0.81245279, + "learning_rate": 0.0007722237072338131, + "loss": 0.82363629, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.20141602, + "step": 1753, + "time_per_iteration": 2.715123414993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129336, + "balance_loss_mlp": 1.10946393, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.09907858659655516, + "language_loss": 0.85174322, + "learning_rate": 0.0007719623342828046, + "loss": 0.86303657, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.1986084, + "step": 1754, + "time_per_iteration": 2.580603837966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011433, + "balance_loss_mlp": 1.12336826, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.09468217220840029, + "language_loss": 0.84008503, + "learning_rate": 0.000771700855746654, + "loss": 0.85151798, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.19934082, + "step": 1755, + "time_per_iteration": 2.6360206604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115036, + "balance_loss_mlp": 1.13060665, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 
0.06173278613548714, + "language_loss": 0.8813622, + "learning_rate": 0.0007714392717268763, + "loss": 0.89286578, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.19750977, + "step": 1756, + "time_per_iteration": 2.610471725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169557, + "balance_loss_mlp": 1.14999521, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.08560719953811556, + "language_loss": 0.86437309, + "learning_rate": 0.0007711775823250273, + "loss": 0.87606871, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.19555664, + "step": 1757, + "time_per_iteration": 2.5406768321990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179489, + "balance_loss_mlp": 1.16010547, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.06814979795763555, + "language_loss": 0.82866555, + "learning_rate": 0.0007709157876427039, + "loss": 0.84046042, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.19372559, + "step": 1758, + "time_per_iteration": 3.144188642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152293, + "balance_loss_mlp": 1.13320732, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.08381425857535812, + "language_loss": 0.85356963, + "learning_rate": 0.0007706538877815439, + "loss": 0.86509264, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.1907959, + "step": 1759, + "time_per_iteration": 2.6544251441955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145804, + "balance_loss_mlp": 1.12751722, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.07160952497477109, + "language_loss": 0.83250809, + "learning_rate": 0.0007703918828432259, + "loss": 0.84396613, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.18273926, + "step": 1760, + "time_per_iteration": 2.639800548553467 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01139561, + "balance_loss_mlp": 1.12061834, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 0.07528387784347967, + "language_loss": 0.89063478, + "learning_rate": 0.000770129772929469, + "loss": 0.90203035, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.18933105, + "step": 1761, + "time_per_iteration": 2.690807580947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143571, + "balance_loss_mlp": 1.12493849, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.07941213480930635, + "language_loss": 0.87791038, + "learning_rate": 0.0007698675581420334, + "loss": 0.88934612, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.18615723, + "step": 1762, + "time_per_iteration": 2.897935390472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135098, + "balance_loss_mlp": 1.11646509, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.08353352960784785, + "language_loss": 0.78453314, + "learning_rate": 0.0007696052385827199, + "loss": 0.79588407, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.18603516, + "step": 1763, + "time_per_iteration": 2.960893154144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144695, + "balance_loss_mlp": 1.12652755, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.0785374693184301, + "language_loss": 0.77934641, + "learning_rate": 0.00076934281435337, + "loss": 0.7907933, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.18188477, + "step": 1764, + "time_per_iteration": 2.8066813945770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131427, + "balance_loss_mlp": 1.11263931, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.11428683327792583, + "language_loss": 0.86483157, + "learning_rate": 0.0007690802855558658, + "loss": 0.87614584, + "num_input_tokens_seen": 
145696416, + "router_z_loss_mlp": 0.18762207, + "step": 1765, + "time_per_iteration": 2.9382381439208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097374, + "balance_loss_mlp": 1.08335495, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.038046821471630334, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77472329, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.140625, + "step": 1766, + "time_per_iteration": 4.939141750335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131665, + "balance_loss_mlp": 1.11155438, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.1972001158351392, + "language_loss": 0.89103919, + "learning_rate": 0.0007685549146641262, + "loss": 0.90235579, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.20117188, + "step": 1767, + "time_per_iteration": 2.596677780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113815, + "balance_loss_mlp": 1.11898088, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.0754052007703104, + "language_loss": 0.87994409, + "learning_rate": 0.0007682920727738579, + "loss": 0.89132559, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.19152832, + "step": 1768, + "time_per_iteration": 2.572606325149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011476, + "balance_loss_mlp": 1.12763298, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.09008834675764238, + "language_loss": 0.84476101, + "learning_rate": 0.000768029126723369, + "loss": 0.85623699, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.19958496, + "step": 1769, + "time_per_iteration": 2.517974615097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117614, + "balance_loss_mlp": 1.15621972, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 
0.08324416055939475, + "language_loss": 0.81926113, + "learning_rate": 0.0007677660766147447, + "loss": 0.83102256, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.19909668, + "step": 1770, + "time_per_iteration": 2.525979518890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113196, + "balance_loss_mlp": 1.0996542, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.058076344856887535, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73584139, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.13574219, + "step": 1771, + "time_per_iteration": 4.954227924346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208192, + "balance_loss_mlp": 1.18773556, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.12544773614524246, + "language_loss": 0.79168922, + "learning_rate": 0.0007672396646316306, + "loss": 0.80377114, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.20446777, + "step": 1772, + "time_per_iteration": 2.5573487281799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184404, + "balance_loss_mlp": 1.1633631, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.0812632702006711, + "language_loss": 0.80576169, + "learning_rate": 0.000766976302961512, + "loss": 0.81760573, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.21057129, + "step": 1773, + "time_per_iteration": 2.9981236457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174902, + "balance_loss_mlp": 1.15440965, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.08509012237921207, + "language_loss": 0.81078374, + "learning_rate": 0.0007667128376420003, + "loss": 0.82253277, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.20495605, + "step": 1774, + "time_per_iteration": 2.6422817707061768 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01141783, + "balance_loss_mlp": 1.12135017, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 0.07609688435085656, + "language_loss": 0.84329826, + "learning_rate": 0.0007664492687753817, + "loss": 0.85471606, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.2043457, + "step": 1775, + "time_per_iteration": 2.719444513320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133962, + "balance_loss_mlp": 1.11357749, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.0684007600896635, + "language_loss": 0.81250805, + "learning_rate": 0.000766185596463983, + "loss": 0.82384765, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.20397949, + "step": 1776, + "time_per_iteration": 2.641289472579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118215, + "balance_loss_mlp": 1.09844995, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.08848921826202948, + "language_loss": 0.76858222, + "learning_rate": 0.0007659218208101706, + "loss": 0.77976441, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.19750977, + "step": 1777, + "time_per_iteration": 3.121042490005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111411, + "balance_loss_mlp": 1.09507275, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.06446172596419028, + "language_loss": 0.84679043, + "learning_rate": 0.0007656579419163515, + "loss": 0.85793149, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.19018555, + "step": 1778, + "time_per_iteration": 2.8044042587280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115799, + "balance_loss_mlp": 1.09639132, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.08419061749659096, + "language_loss": 0.7684586, + "learning_rate": 0.0007653939598849724, + "loss": 0.77961665, + "num_input_tokens_seen": 
146982496, + "router_z_loss_mlp": 0.19396973, + "step": 1779, + "time_per_iteration": 2.5383636951446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090316, + "balance_loss_mlp": 1.07667828, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.04688573866990776, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83970523, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.13671875, + "step": 1780, + "time_per_iteration": 4.939146041870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100725, + "balance_loss_mlp": 1.0817349, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.09328427377426286, + "language_loss": 0.7993626, + "learning_rate": 0.000764865686819522, + "loss": 0.81036985, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.18969727, + "step": 1781, + "time_per_iteration": 3.0855140686035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097786, + "balance_loss_mlp": 1.07818818, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.0784117519331498, + "language_loss": 0.85829425, + "learning_rate": 0.0007646013959905449, + "loss": 0.86927211, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.19592285, + "step": 1782, + "time_per_iteration": 2.6008715629577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094184, + "balance_loss_mlp": 1.07484865, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.10020930760951015, + "language_loss": 0.80767882, + "learning_rate": 0.0007643370024341949, + "loss": 0.81862062, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.19311523, + "step": 1783, + "time_per_iteration": 3.1744794845581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093606, + "balance_loss_mlp": 1.0741868, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + 
"grad_norm": 0.06177623901241128, + "language_loss": 0.82775044, + "learning_rate": 0.0007640725062531195, + "loss": 0.83868653, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.19396973, + "step": 1784, + "time_per_iteration": 2.5207273960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095739, + "balance_loss_mlp": 1.07624829, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.07609738057692413, + "language_loss": 0.86137176, + "learning_rate": 0.0007638079075500047, + "loss": 0.87232918, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.19482422, + "step": 1785, + "time_per_iteration": 2.6027305126190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_mlp": 1.02909327, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.02730093024075542, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76222348, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.12597656, + "step": 1786, + "time_per_iteration": 4.981709718704224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123604, + "balance_loss_mlp": 1.10412502, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.0828485615256838, + "language_loss": 0.82775986, + "learning_rate": 0.0007632784029886026, + "loss": 0.83899587, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.19470215, + "step": 1787, + "time_per_iteration": 2.6825647354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121547, + "balance_loss_mlp": 1.10167432, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.06541046205818803, + "language_loss": 0.84959292, + "learning_rate": 0.0007630134973358873, + "loss": 0.86080837, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.1986084, + "step": 1788, + "time_per_iteration": 3.0164642333984375 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112959, + "balance_loss_mlp": 1.11006355, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + "grad_norm": 0.07128557935976318, + "language_loss": 0.86626679, + "learning_rate": 0.0007627484895722763, + "loss": 0.8775627, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.1953125, + "step": 1789, + "time_per_iteration": 2.7014718055725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134771, + "balance_loss_mlp": 1.11494648, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.08217230393347356, + "language_loss": 0.80139697, + "learning_rate": 0.0007624833798006552, + "loss": 0.81274474, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.19812012, + "step": 1790, + "time_per_iteration": 3.0889768600463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130993, + "balance_loss_mlp": 1.11054873, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.08452412416329605, + "language_loss": 0.83807981, + "learning_rate": 0.0007622181681239483, + "loss": 0.84938967, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.20446777, + "step": 1791, + "time_per_iteration": 2.668236017227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126724, + "balance_loss_mlp": 1.10656524, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.06876002435899166, + "language_loss": 0.84450197, + "learning_rate": 0.0007619528546451202, + "loss": 0.85576922, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.20153809, + "step": 1792, + "time_per_iteration": 2.820676326751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121181, + "balance_loss_mlp": 1.10096347, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.0839228841992506, + "language_loss": 0.83888298, + "learning_rate": 0.0007616874394671745, + "loss": 0.8500948, + 
"num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.20214844, + "step": 1793, + "time_per_iteration": 3.339189291000366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121499, + "balance_loss_mlp": 1.10161519, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.08136840273622996, + "language_loss": 0.84983474, + "learning_rate": 0.0007614219226931547, + "loss": 0.86104971, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.19873047, + "step": 1794, + "time_per_iteration": 2.7227368354797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129707, + "balance_loss_mlp": 1.10958409, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.09590444489475901, + "language_loss": 0.84532511, + "learning_rate": 0.0007611563044261435, + "loss": 0.85662222, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.2010498, + "step": 1795, + "time_per_iteration": 2.546884536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125004, + "balance_loss_mlp": 1.10475039, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.0814281657370807, + "language_loss": 0.86456835, + "learning_rate": 0.0007608905847692631, + "loss": 0.87581837, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.20251465, + "step": 1796, + "time_per_iteration": 2.482780933380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116976, + "balance_loss_mlp": 1.0972116, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.08445523119956015, + "language_loss": 0.86433315, + "learning_rate": 0.0007606247638256749, + "loss": 0.87550294, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.19750977, + "step": 1797, + "time_per_iteration": 2.8908944129943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041988, + "balance_loss_mlp": 1.03016257, + "epoch": 0.34590227010388613, + "flos": 
1567694518272.0, + "grad_norm": 0.0206101242754925, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79212284, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.11816406, + "step": 1798, + "time_per_iteration": 4.959855079650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037793, + "balance_loss_mlp": 1.02591991, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.018708496865608985, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80365002, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.11865234, + "step": 1799, + "time_per_iteration": 4.7935545444488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129126, + "balance_loss_mlp": 1.10934877, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.08973397272803926, + "language_loss": 0.85623878, + "learning_rate": 0.0007598266943068686, + "loss": 0.86753011, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.19763184, + "step": 1800, + "time_per_iteration": 2.8019869327545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112277, + "balance_loss_mlp": 1.10252821, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.0674943248051881, + "language_loss": 0.83542264, + "learning_rate": 0.0007595604692488507, + "loss": 0.84665036, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.20239258, + "step": 1801, + "time_per_iteration": 2.6360082626342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126397, + "balance_loss_mlp": 1.10636973, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.06909713253641608, + "language_loss": 0.82839429, + "learning_rate": 0.0007592941434205215, + "loss": 0.83965826, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.20031738, + "step": 1802, + "time_per_iteration": 2.8132333755493164 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015999, + "balance_loss_mlp": 1.0041256, + "epoch": 0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.010015114509230977, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74587059, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.11865234, + "step": 1803, + "time_per_iteration": 5.086339950561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104785, + "balance_loss_mlp": 1.08531845, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07426270940157376, + "language_loss": 0.80069757, + "learning_rate": 0.0007587611898665566, + "loss": 0.81174541, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.19458008, + "step": 1804, + "time_per_iteration": 3.092641592025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110039, + "balance_loss_mlp": 1.0910604, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.07581928055471668, + "language_loss": 0.81691384, + "learning_rate": 0.0007584945623478315, + "loss": 0.82801425, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.18969727, + "step": 1805, + "time_per_iteration": 2.846060037612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104021, + "balance_loss_mlp": 1.08541238, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.07473751481828116, + "language_loss": 0.80751228, + "learning_rate": 0.000758227834472617, + "loss": 0.81855249, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.18603516, + "step": 1806, + "time_per_iteration": 3.0771524906158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111397, + "balance_loss_mlp": 1.09499145, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.07117533522239076, + "language_loss": 0.77160984, + "learning_rate": 0.0007579610063444664, + "loss": 
0.78274959, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.18969727, + "step": 1807, + "time_per_iteration": 2.765228509902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104162, + "balance_loss_mlp": 1.08548236, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.0766054024965894, + "language_loss": 0.8690778, + "learning_rate": 0.0007576940780669712, + "loss": 0.88011932, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.18664551, + "step": 1808, + "time_per_iteration": 3.279489278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123806, + "balance_loss_mlp": 1.10510182, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.07904928967380129, + "language_loss": 0.84151316, + "learning_rate": 0.0007574270497437624, + "loss": 0.85275126, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.18701172, + "step": 1809, + "time_per_iteration": 2.987900733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122089, + "balance_loss_mlp": 1.10336101, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.06962767524782593, + "language_loss": 0.87729847, + "learning_rate": 0.000757159921478509, + "loss": 0.88851929, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.18725586, + "step": 1810, + "time_per_iteration": 2.8426477909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055659, + "balance_loss_mlp": 1.04316616, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.023331363727236345, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75506294, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.125, + "step": 1811, + "time_per_iteration": 4.784373044967651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146056, + "balance_loss_mlp": 1.12720931, + "epoch": 0.34859561369757597, + 
"flos": 509164102656.0, + "grad_norm": 0.0794635065049281, + "language_loss": 0.87678373, + "learning_rate": 0.0007566253655367423, + "loss": 0.88824427, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.18823242, + "step": 1812, + "time_per_iteration": 2.649627685546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151883, + "balance_loss_mlp": 1.13314283, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.08948054068367119, + "language_loss": 0.89612782, + "learning_rate": 0.000756357938067762, + "loss": 0.90764666, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.18737793, + "step": 1813, + "time_per_iteration": 2.6953165531158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151317, + "balance_loss_mlp": 1.13220787, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.08322597535257283, + "language_loss": 0.82610291, + "learning_rate": 0.0007560904110718033, + "loss": 0.83761609, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.19104004, + "step": 1814, + "time_per_iteration": 3.2898061275482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124529, + "balance_loss_mlp": 1.10556281, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.08612147208900138, + "language_loss": 0.8345058, + "learning_rate": 0.0007558227846527297, + "loss": 0.84575117, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.1895752, + "step": 1815, + "time_per_iteration": 2.9130759239196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123683, + "balance_loss_mlp": 1.10491991, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.09988459790630169, + "language_loss": 0.83118773, + "learning_rate": 0.0007555550589144429, + "loss": 0.84242463, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.1875, + "step": 1816, + "time_per_iteration": 2.4752960205078125 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117728, + "balance_loss_mlp": 1.09804606, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.07751955343806295, + "language_loss": 0.84176993, + "learning_rate": 0.000755287233960883, + "loss": 0.85294718, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.19665527, + "step": 1817, + "time_per_iteration": 2.597585439682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098926, + "balance_loss_mlp": 1.07926798, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.08165217026076037, + "language_loss": 0.7746554, + "learning_rate": 0.0007550193098960292, + "loss": 0.78564465, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.19641113, + "step": 1818, + "time_per_iteration": 2.9257001876831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092408, + "balance_loss_mlp": 1.07195151, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.0691698669989475, + "language_loss": 0.85927546, + "learning_rate": 0.0007547512868238988, + "loss": 0.87019956, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.20446777, + "step": 1819, + "time_per_iteration": 3.1347925662994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108081, + "balance_loss_mlp": 1.06050837, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.09514158419007644, + "language_loss": 0.83275855, + "learning_rate": 0.0007544831648485473, + "loss": 0.84356666, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.20300293, + "step": 1820, + "time_per_iteration": 2.7215232849121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108343, + "balance_loss_mlp": 1.06210327, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.1073780855917388, + "language_loss": 0.81151676, + "learning_rate": 0.0007542149440740694, + "loss": 
0.82235104, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.21350098, + "step": 1821, + "time_per_iteration": 2.6931724548339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080905, + "balance_loss_mlp": 1.05936432, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.1562262811893555, + "language_loss": 0.85392433, + "learning_rate": 0.000753946624604597, + "loss": 0.86473334, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.2154541, + "step": 1822, + "time_per_iteration": 2.7700464725494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072173, + "balance_loss_mlp": 1.05028629, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.08427952696401207, + "language_loss": 0.87906677, + "learning_rate": 0.0007536782065443015, + "loss": 0.88978851, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.21899414, + "step": 1823, + "time_per_iteration": 2.618863105773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084137, + "balance_loss_mlp": 1.06188059, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.15781529291863344, + "language_loss": 0.75435269, + "learning_rate": 0.0007534096899973919, + "loss": 0.76519406, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.22253418, + "step": 1824, + "time_per_iteration": 2.5891709327697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086563, + "balance_loss_mlp": 1.06396103, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.09040612359289192, + "language_loss": 0.82346433, + "learning_rate": 0.0007531410750681154, + "loss": 0.83432996, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.22595215, + "step": 1825, + "time_per_iteration": 2.810972213745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111588, + "balance_loss_mlp": 1.09455299, + "epoch": 0.35128895729126586, + 
"flos": 1020535137792.0, + "grad_norm": 0.07292466952983544, + "language_loss": 0.86399037, + "learning_rate": 0.0007528723618607575, + "loss": 0.87514913, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.21325684, + "step": 1826, + "time_per_iteration": 3.474869966506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133984, + "balance_loss_mlp": 1.11370611, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.08837862995453269, + "language_loss": 0.82404733, + "learning_rate": 0.0007526035504796422, + "loss": 0.83538717, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.20275879, + "step": 1827, + "time_per_iteration": 2.8155739307403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150633, + "balance_loss_mlp": 1.13051069, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.10569988158542801, + "language_loss": 0.86735702, + "learning_rate": 0.0007523346410291312, + "loss": 0.87886333, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.20117188, + "step": 1828, + "time_per_iteration": 2.788748025894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147917, + "balance_loss_mlp": 1.12691236, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.11718864183712574, + "language_loss": 0.84880495, + "learning_rate": 0.0007520656336136245, + "loss": 0.86028415, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.21020508, + "step": 1829, + "time_per_iteration": 2.995258331298828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144006, + "balance_loss_mlp": 1.12407422, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.07752679685559628, + "language_loss": 0.87776285, + "learning_rate": 0.0007517965283375599, + "loss": 0.88920295, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.19921875, + "step": 1830, + "time_per_iteration": 
2.9131507873535156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137113, + "balance_loss_mlp": 1.11694324, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.0712879308552529, + "language_loss": 0.89257503, + "learning_rate": 0.0007515273253054132, + "loss": 0.90394616, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.20166016, + "step": 1831, + "time_per_iteration": 2.7115964889526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144172, + "balance_loss_mlp": 1.12451458, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.08358912815272257, + "language_loss": 0.82353687, + "learning_rate": 0.0007512580246216988, + "loss": 0.83497858, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.19665527, + "step": 1832, + "time_per_iteration": 2.7660555839538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137261, + "balance_loss_mlp": 1.11740053, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.08932198209233742, + "language_loss": 0.84907162, + "learning_rate": 0.000750988626390968, + "loss": 0.86044419, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.19848633, + "step": 1833, + "time_per_iteration": 2.6142635345458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135258, + "balance_loss_mlp": 1.11577928, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.0712191508208571, + "language_loss": 0.84978765, + "learning_rate": 0.0007507191307178108, + "loss": 0.86114025, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.19470215, + "step": 1834, + "time_per_iteration": 2.8424935340881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124766, + "balance_loss_mlp": 1.10512066, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.12990441969076433, + "language_loss": 0.74422562, + "learning_rate": 
0.0007504495377068543, + "loss": 0.75547332, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.19628906, + "step": 1835, + "time_per_iteration": 2.8079066276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129638, + "balance_loss_mlp": 1.11026645, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09183665723882013, + "language_loss": 0.81276792, + "learning_rate": 0.0007501798474627642, + "loss": 0.82406431, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.19360352, + "step": 1836, + "time_per_iteration": 2.952760934829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120111, + "balance_loss_mlp": 1.10109687, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.11181895830758388, + "language_loss": 0.83497429, + "learning_rate": 0.0007499100600902433, + "loss": 0.84617543, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.18994141, + "step": 1837, + "time_per_iteration": 3.0599989891052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112032, + "balance_loss_mlp": 1.09237409, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.08618493176537427, + "language_loss": 0.84243816, + "learning_rate": 0.0007496401756940324, + "loss": 0.85355854, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.1965332, + "step": 1838, + "time_per_iteration": 2.7366483211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111194, + "balance_loss_mlp": 1.09217548, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.1107744559232423, + "language_loss": 0.82783937, + "learning_rate": 0.0007493701943789098, + "loss": 0.8389588, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.19750977, + "step": 1839, + "time_per_iteration": 2.780212640762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107677, + "balance_loss_mlp": 1.08844888, + 
"epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 0.07955024359155173, + "language_loss": 0.82622725, + "learning_rate": 0.000749100116249692, + "loss": 0.83730406, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.19213867, + "step": 1840, + "time_per_iteration": 2.59558367729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110996, + "balance_loss_mlp": 1.09009957, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.09363875008830587, + "language_loss": 0.86041892, + "learning_rate": 0.0007488299414112321, + "loss": 0.87151849, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.19848633, + "step": 1841, + "time_per_iteration": 2.625204563140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112719, + "balance_loss_mlp": 1.0932045, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.07784236461393054, + "language_loss": 0.77495539, + "learning_rate": 0.0007485596699684215, + "loss": 0.78608257, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.19506836, + "step": 1842, + "time_per_iteration": 2.889179229736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110913, + "balance_loss_mlp": 1.0890193, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.0730470956511186, + "language_loss": 0.85287404, + "learning_rate": 0.000748289302026189, + "loss": 0.86396539, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.2010498, + "step": 1843, + "time_per_iteration": 2.8508758544921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117167, + "balance_loss_mlp": 1.09693718, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.08361202953284802, + "language_loss": 0.85558116, + "learning_rate": 0.0007480188376895004, + "loss": 0.8667528, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.20227051, + "step": 1844, + 
"time_per_iteration": 3.0799713134765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058665, + "balance_loss_mlp": 1.04655302, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.036648944322370085, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74870002, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.12109375, + "step": 1845, + "time_per_iteration": 4.911001205444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151869, + "balance_loss_mlp": 1.1320442, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08485938300722028, + "language_loss": 0.78214371, + "learning_rate": 0.0007474776202528074, + "loss": 0.79366243, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.19824219, + "step": 1846, + "time_per_iteration": 3.0216140747070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161099, + "balance_loss_mlp": 1.1411432, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08254469932015057, + "language_loss": 0.81304067, + "learning_rate": 0.000747206867362922, + "loss": 0.82465172, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.19946289, + "step": 1847, + "time_per_iteration": 3.090902090072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160243, + "balance_loss_mlp": 1.13996506, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.07042821685917994, + "language_loss": 0.83881712, + "learning_rate": 0.0007469360184988194, + "loss": 0.85041958, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.20275879, + "step": 1848, + "time_per_iteration": 2.834099292755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164664, + "balance_loss_mlp": 1.14419615, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08278620993607219, + "language_loss": 0.86537004, + 
"learning_rate": 0.0007466650737656518, + "loss": 0.87701666, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.20471191, + "step": 1849, + "time_per_iteration": 2.6372272968292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164893, + "balance_loss_mlp": 1.14411473, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.1003606576453008, + "language_loss": 0.90052241, + "learning_rate": 0.0007463940332686098, + "loss": 0.9121713, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.20788574, + "step": 1850, + "time_per_iteration": 2.485778331756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138299, + "balance_loss_mlp": 1.11759257, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.07662996022318802, + "language_loss": 0.83963442, + "learning_rate": 0.0007461228971129205, + "loss": 0.85101742, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.20715332, + "step": 1851, + "time_per_iteration": 2.9709644317626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119617, + "balance_loss_mlp": 1.09905326, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.09722709387095821, + "language_loss": 0.8525731, + "learning_rate": 0.0007458516654038483, + "loss": 0.86376923, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.20568848, + "step": 1852, + "time_per_iteration": 2.678692579269409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122543, + "balance_loss_mlp": 1.10156226, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.11064851070237179, + "language_loss": 0.86565018, + "learning_rate": 0.0007455803382466946, + "loss": 0.87687564, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.20983887, + "step": 1853, + "time_per_iteration": 2.8357412815093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118205, + 
"balance_loss_mlp": 1.0977726, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 0.07486516106338226, + "language_loss": 0.87089902, + "learning_rate": 0.0007453089157467979, + "loss": 0.88208103, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.2043457, + "step": 1854, + "time_per_iteration": 2.808497667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110339, + "balance_loss_mlp": 1.08300531, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.0938349401282225, + "language_loss": 0.82008994, + "learning_rate": 0.0007450373980095341, + "loss": 0.83112389, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.20385742, + "step": 1855, + "time_per_iteration": 3.127755641937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102136, + "balance_loss_mlp": 1.08226347, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.07357008991516471, + "language_loss": 0.86741251, + "learning_rate": 0.0007447657851403155, + "loss": 0.87843382, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.1986084, + "step": 1856, + "time_per_iteration": 2.662548780441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104026, + "balance_loss_mlp": 1.08421302, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.09605793543255373, + "language_loss": 0.78325486, + "learning_rate": 0.0007444940772445915, + "loss": 0.79429507, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.19812012, + "step": 1857, + "time_per_iteration": 2.7455575466156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098079, + "balance_loss_mlp": 1.07937515, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.09380435326028273, + "language_loss": 0.80025625, + "learning_rate": 0.0007442222744278484, + "loss": 0.81123704, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 
0.18688965, + "step": 1858, + "time_per_iteration": 2.7159781455993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110587, + "balance_loss_mlp": 1.08752322, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.07197173632554923, + "language_loss": 0.8371805, + "learning_rate": 0.0007439503767956099, + "loss": 0.84823918, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.18347168, + "step": 1859, + "time_per_iteration": 2.7746405601501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129586, + "balance_loss_mlp": 1.11757004, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.053548748661834844, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80801189, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.12011719, + "step": 1860, + "time_per_iteration": 4.952972412109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141933, + "balance_loss_mlp": 1.12300301, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.07146029040980974, + "language_loss": 0.86061597, + "learning_rate": 0.000743406297506922, + "loss": 0.87203526, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.18920898, + "step": 1861, + "time_per_iteration": 2.788799285888672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155472, + "balance_loss_mlp": 1.13686371, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.08496046226468609, + "language_loss": 0.83806807, + "learning_rate": 0.0007431341160617031, + "loss": 0.84962279, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.18615723, + "step": 1862, + "time_per_iteration": 2.891972780227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153957, + "balance_loss_mlp": 1.13561106, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.08024798355603865, + 
"language_loss": 0.87945759, + "learning_rate": 0.0007428618402234491, + "loss": 0.89099711, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.18347168, + "step": 1863, + "time_per_iteration": 2.6548287868499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157244, + "balance_loss_mlp": 1.13868272, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.10629387801358743, + "language_loss": 0.79862851, + "learning_rate": 0.0007425894700978668, + "loss": 0.81020093, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.18579102, + "step": 1864, + "time_per_iteration": 2.80774188041687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153191, + "balance_loss_mlp": 1.13476086, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.07530240473897643, + "language_loss": 0.79704821, + "learning_rate": 0.0007423170057906996, + "loss": 0.80858016, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.1842041, + "step": 1865, + "time_per_iteration": 3.8680994510650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145718, + "balance_loss_mlp": 1.12701416, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.09184761749378255, + "language_loss": 0.86028153, + "learning_rate": 0.0007420444474077275, + "loss": 0.87173867, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.18688965, + "step": 1866, + "time_per_iteration": 2.5685620307922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113899, + "balance_loss_mlp": 1.12003553, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.09893409220555562, + "language_loss": 0.89461643, + "learning_rate": 0.0007417717950547671, + "loss": 0.90600634, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.18945312, + "step": 1867, + "time_per_iteration": 2.671124219894409 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0107889, + "balance_loss_mlp": 1.06611049, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 0.038408778239575524, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77075499, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.12792969, + "step": 1868, + "time_per_iteration": 4.9185333251953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122572, + "balance_loss_mlp": 1.10416651, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.07553494616843248, + "language_loss": 0.84798276, + "learning_rate": 0.0007412262088623299, + "loss": 0.85920852, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.18408203, + "step": 1869, + "time_per_iteration": 2.7392468452453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.10186732, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.08536155576366684, + "language_loss": 0.79418659, + "learning_rate": 0.0007409532752346684, + "loss": 0.80538857, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.18334961, + "step": 1870, + "time_per_iteration": 2.696479082107544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119342, + "balance_loss_mlp": 1.10078073, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.06482127106924716, + "language_loss": 0.88322479, + "learning_rate": 0.0007406802480606491, + "loss": 0.89441818, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.18566895, + "step": 1871, + "time_per_iteration": 2.636009931564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125376, + "balance_loss_mlp": 1.1068871, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.08328980109467413, + "language_loss": 0.90382409, + "learning_rate": 0.0007404071274462707, + "loss": 0.91507781, + "num_input_tokens_seen": 
155274176, + "router_z_loss_mlp": 0.18493652, + "step": 1872, + "time_per_iteration": 2.6033034324645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126247, + "balance_loss_mlp": 1.10767388, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.08507135616363887, + "language_loss": 0.83713084, + "learning_rate": 0.0007401339134975682, + "loss": 0.84839332, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.18579102, + "step": 1873, + "time_per_iteration": 2.6768579483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124337, + "balance_loss_mlp": 1.1061461, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.08710024588150622, + "language_loss": 0.8447001, + "learning_rate": 0.0007398606063206122, + "loss": 0.8559435, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.18200684, + "step": 1874, + "time_per_iteration": 2.6102805137634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118797, + "balance_loss_mlp": 1.1010226, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.09331990326127676, + "language_loss": 0.78271621, + "learning_rate": 0.0007395872060215101, + "loss": 0.79390419, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.17773438, + "step": 1875, + "time_per_iteration": 2.6235439777374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125165, + "balance_loss_mlp": 1.10746276, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.08705098996186143, + "language_loss": 0.8794744, + "learning_rate": 0.0007393137127064056, + "loss": 0.89072609, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.17724609, + "step": 1876, + "time_per_iteration": 2.693005323410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131992, + "balance_loss_mlp": 1.11434913, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 
0.07970542462566557, + "language_loss": 0.84223264, + "learning_rate": 0.0007390401264814779, + "loss": 0.85355258, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.17675781, + "step": 1877, + "time_per_iteration": 2.6267154216766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144153, + "balance_loss_mlp": 1.12600899, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.11052243492945069, + "language_loss": 0.84164327, + "learning_rate": 0.0007387664474529427, + "loss": 0.8530848, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.18151855, + "step": 1878, + "time_per_iteration": 2.6380414962768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114366, + "balance_loss_mlp": 1.12561202, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.06785614970382317, + "language_loss": 0.91167343, + "learning_rate": 0.0007384926757270518, + "loss": 0.92311001, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.18054199, + "step": 1879, + "time_per_iteration": 2.6760640144348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148828, + "balance_loss_mlp": 1.13057721, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.07379174248702317, + "language_loss": 0.79513329, + "learning_rate": 0.0007382188114100924, + "loss": 0.80662155, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.18249512, + "step": 1880, + "time_per_iteration": 2.980865716934204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140419, + "balance_loss_mlp": 1.12196517, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.08452869991753884, + "language_loss": 0.81477511, + "learning_rate": 0.0007379448546083884, + "loss": 0.82617927, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.18457031, + "step": 1881, + "time_per_iteration": 2.9168553352355957 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01122618, + "balance_loss_mlp": 1.10411692, + "epoch": 0.3620623316660254, + "flos": 747546444288.0, + "grad_norm": 0.07446388495521607, + "language_loss": 0.87973779, + "learning_rate": 0.0007376708054282992, + "loss": 0.89096403, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.18481445, + "step": 1882, + "time_per_iteration": 2.987179756164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115299, + "balance_loss_mlp": 1.09675002, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.06334344400813875, + "language_loss": 0.83726645, + "learning_rate": 0.0007373966639762201, + "loss": 0.84841949, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.18530273, + "step": 1883, + "time_per_iteration": 2.611685276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107737, + "balance_loss_mlp": 1.08896196, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.34913247510054485, + "language_loss": 0.88361132, + "learning_rate": 0.0007371224303585822, + "loss": 0.89468867, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.18762207, + "step": 1884, + "time_per_iteration": 2.5775835514068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055354, + "balance_loss_mlp": 1.04219282, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.031056792089232132, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81412423, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.13183594, + "step": 1885, + "time_per_iteration": 4.700505256652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125656, + "balance_loss_mlp": 1.10721421, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.08679320645386224, + "language_loss": 0.82572937, + "learning_rate": 0.0007365736870525335, + "loss": 0.83698595, + 
"num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.18457031, + "step": 1886, + "time_per_iteration": 2.859740734100342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129292, + "balance_loss_mlp": 1.11139846, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.08795223769340633, + "language_loss": 0.82107997, + "learning_rate": 0.000736299177577164, + "loss": 0.8323729, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.17907715, + "step": 1887, + "time_per_iteration": 2.5841786861419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130014, + "balance_loss_mlp": 1.11198997, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.08315005772253937, + "language_loss": 0.83388066, + "learning_rate": 0.0007360245763623174, + "loss": 0.84518075, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.18029785, + "step": 1888, + "time_per_iteration": 2.665529489517212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.12729573, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.088670630002398, + "language_loss": 0.89456129, + "learning_rate": 0.0007357498835146039, + "loss": 0.90601313, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.17895508, + "step": 1889, + "time_per_iteration": 2.8847129344940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156911, + "balance_loss_mlp": 1.13911295, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.10357296063524607, + "language_loss": 0.87070376, + "learning_rate": 0.0007354750991406684, + "loss": 0.8822729, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.17810059, + "step": 1890, + "time_per_iteration": 2.723062753677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159624, + "balance_loss_mlp": 1.14133692, + "epoch": 0.3637937668333975, + "flos": 
546653919744.0, + "grad_norm": 0.08144896750451855, + "language_loss": 0.80397975, + "learning_rate": 0.0007352002233471919, + "loss": 0.81557596, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.18310547, + "step": 1891, + "time_per_iteration": 2.6574442386627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175761, + "balance_loss_mlp": 1.15818954, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.12092525276023756, + "language_loss": 0.79267627, + "learning_rate": 0.0007349252562408906, + "loss": 0.80443388, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.17590332, + "step": 1892, + "time_per_iteration": 2.7125816345214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180182, + "balance_loss_mlp": 1.16231263, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.10164191197483487, + "language_loss": 0.81473255, + "learning_rate": 0.0007346501979285158, + "loss": 0.82653439, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.17883301, + "step": 1893, + "time_per_iteration": 2.902371406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069378, + "balance_loss_mlp": 1.05621696, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.029928407037273664, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.8160848, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.13183594, + "step": 1894, + "time_per_iteration": 4.841979265213013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166903, + "balance_loss_mlp": 1.14858055, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.079124644393009, + "language_loss": 0.85946983, + "learning_rate": 0.0007340998081127308, + "loss": 0.87113881, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.18322754, + "step": 1895, + "time_per_iteration": 2.7981679439544678 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149308, + "balance_loss_mlp": 1.13090205, + "epoch": 0.36475567525971525, + "flos": 599509610496.0, + "grad_norm": 0.08117131709807607, + "language_loss": 0.90645039, + "learning_rate": 0.0007338244768230007, + "loss": 0.91794348, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.18408203, + "step": 1896, + "time_per_iteration": 2.821958541870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131855, + "balance_loss_mlp": 1.11337733, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.06648659114179455, + "language_loss": 0.88624144, + "learning_rate": 0.0007335490547545578, + "loss": 0.89756, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.18469238, + "step": 1897, + "time_per_iteration": 3.0718753337860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115822, + "balance_loss_mlp": 1.09670115, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06783762736794967, + "language_loss": 0.82265627, + "learning_rate": 0.0007332735420143308, + "loss": 0.8338145, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.19091797, + "step": 1898, + "time_per_iteration": 2.7864439487457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103004, + "balance_loss_mlp": 1.08431149, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.10561663647405507, + "language_loss": 0.86410689, + "learning_rate": 0.0007329979387092826, + "loss": 0.87513697, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.18664551, + "step": 1899, + "time_per_iteration": 2.6032557487487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099553, + "balance_loss_mlp": 1.08087325, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.0619875823145499, + "language_loss": 0.83878422, + "learning_rate": 0.0007327222449464124, + "loss": 
0.84977973, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.18676758, + "step": 1900, + "time_per_iteration": 3.2741036415100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103796, + "balance_loss_mlp": 1.08450782, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07856096432694096, + "language_loss": 0.885158, + "learning_rate": 0.0007324464608327538, + "loss": 0.89619601, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.19287109, + "step": 1901, + "time_per_iteration": 2.678788900375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094923, + "balance_loss_mlp": 1.07613552, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.117877128585243, + "language_loss": 0.88101745, + "learning_rate": 0.0007321705864753758, + "loss": 0.8919667, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.18774414, + "step": 1902, + "time_per_iteration": 2.746980905532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104989, + "balance_loss_mlp": 1.08645177, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.07495737234387592, + "language_loss": 0.83840346, + "learning_rate": 0.0007318946219813823, + "loss": 0.84945333, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.18530273, + "step": 1903, + "time_per_iteration": 3.0181055068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113516, + "balance_loss_mlp": 1.09416842, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.08147269799104237, + "language_loss": 0.89553183, + "learning_rate": 0.000731618567457912, + "loss": 0.90666699, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.19335938, + "step": 1904, + "time_per_iteration": 2.656008243560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112169, + "balance_loss_mlp": 1.10242581, + "epoch": 0.3664871104270873, + 
"flos": 789752954880.0, + "grad_norm": 0.09666599698156476, + "language_loss": 0.86684108, + "learning_rate": 0.000731342423012139, + "loss": 0.87805796, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.19250488, + "step": 1905, + "time_per_iteration": 3.0675060749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130977, + "balance_loss_mlp": 1.11136723, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.07693711099894461, + "language_loss": 0.82752407, + "learning_rate": 0.0007310661887512722, + "loss": 0.83883387, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.19616699, + "step": 1906, + "time_per_iteration": 3.058940887451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121537, + "balance_loss_mlp": 1.10290504, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.08447106182036945, + "language_loss": 0.8153969, + "learning_rate": 0.0007307898647825549, + "loss": 0.82661223, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.1862793, + "step": 1907, + "time_per_iteration": 2.6844449043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123702, + "balance_loss_mlp": 1.10468769, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.09351646457276126, + "language_loss": 0.89255947, + "learning_rate": 0.0007305134512132659, + "loss": 0.90379649, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.18994141, + "step": 1908, + "time_per_iteration": 2.709672451019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110565, + "balance_loss_mlp": 1.09136009, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.10593037141853442, + "language_loss": 0.82889271, + "learning_rate": 0.0007302369481507183, + "loss": 0.83999836, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.19189453, + "step": 1909, + "time_per_iteration": 2.521117687225342 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042583, + "balance_loss_mlp": 1.03214002, + "epoch": 0.36744901885340514, + "flos": 1540090713600.0, + "grad_norm": 0.025696927495133286, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81004339, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10449219, + "step": 1910, + "time_per_iteration": 4.8944993019104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109603, + "balance_loss_mlp": 1.09143519, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.13197556892024634, + "language_loss": 0.85332, + "learning_rate": 0.000729683673975274, + "loss": 0.864416, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.18164062, + "step": 1911, + "time_per_iteration": 2.6855151653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113177, + "balance_loss_mlp": 1.09509254, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.05917682500902713, + "language_loss": 0.82910979, + "learning_rate": 0.0007294069030771774, + "loss": 0.84024155, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.1809082, + "step": 1912, + "time_per_iteration": 3.696908712387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119134, + "balance_loss_mlp": 1.10070467, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.2785371066278341, + "language_loss": 0.90901196, + "learning_rate": 0.0007291300431154224, + "loss": 0.92020327, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.18432617, + "step": 1913, + "time_per_iteration": 2.666469097137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066964, + "balance_loss_mlp": 1.05699825, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.035296075115353785, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 
0.71456701, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.09960938, + "step": 1914, + "time_per_iteration": 5.019417762756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176446, + "balance_loss_mlp": 1.1579566, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.09302167105112862, + "language_loss": 0.79388487, + "learning_rate": 0.0007285760564309179, + "loss": 0.80564928, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.18493652, + "step": 1915, + "time_per_iteration": 3.112898826599121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204501, + "balance_loss_mlp": 1.18492651, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.10352341742670183, + "language_loss": 0.84420514, + "learning_rate": 0.0007282989299232448, + "loss": 0.85625011, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.19567871, + "step": 1916, + "time_per_iteration": 3.0435094833374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222721, + "balance_loss_mlp": 1.20364785, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.07568711881104075, + "language_loss": 0.83658814, + "learning_rate": 0.0007280217147820668, + "loss": 0.84881544, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.19042969, + "step": 1917, + "time_per_iteration": 2.618802547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214339, + "balance_loss_mlp": 1.19502735, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06430089788192027, + "language_loss": 0.78882575, + "learning_rate": 0.0007277444111150079, + "loss": 0.80096912, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.19299316, + "step": 1918, + "time_per_iteration": 2.705514669418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212887, + "balance_loss_mlp": 1.19302678, + "epoch": 0.3691804540207772, + 
"flos": 528868942848.0, + "grad_norm": 0.1316988542142886, + "language_loss": 0.84107184, + "learning_rate": 0.0007274670190297272, + "loss": 0.85320067, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.19848633, + "step": 1919, + "time_per_iteration": 2.643360137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216839, + "balance_loss_mlp": 1.19697857, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.08424482176176182, + "language_loss": 0.82129955, + "learning_rate": 0.0007271895386339179, + "loss": 0.83346796, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.19848633, + "step": 1920, + "time_per_iteration": 2.7766342163085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209662, + "balance_loss_mlp": 1.1898967, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.08336147686301533, + "language_loss": 0.83142531, + "learning_rate": 0.0007269119700353073, + "loss": 0.84352195, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.19763184, + "step": 1921, + "time_per_iteration": 2.747455596923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217254, + "balance_loss_mlp": 1.19840705, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.06910916264284567, + "language_loss": 0.85129571, + "learning_rate": 0.0007266343133416571, + "loss": 0.86346817, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.18811035, + "step": 1922, + "time_per_iteration": 2.815875768661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107275, + "balance_loss_mlp": 1.09573579, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.04105564932095409, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78224194, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.11523438, + "step": 1923, + "time_per_iteration": 4.86853289604187 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198125, + "balance_loss_mlp": 1.17899168, + "epoch": 0.37014236244709503, + "flos": 497338776576.0, + "grad_norm": 0.1110881339245658, + "language_loss": 0.84574348, + "learning_rate": 0.0007260787361004556, + "loss": 0.85772473, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.19128418, + "step": 1924, + "time_per_iteration": 2.580287456512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060779, + "balance_loss_mlp": 1.0494777, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.023148070033358246, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74822283, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11279297, + "step": 1925, + "time_per_iteration": 4.9416913986206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175905, + "balance_loss_mlp": 1.15692663, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.06834955035904498, + "language_loss": 0.87516356, + "learning_rate": 0.0007255228077730903, + "loss": 0.8869226, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.18969727, + "step": 1926, + "time_per_iteration": 2.7211105823516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176426, + "balance_loss_mlp": 1.15784156, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.06265343241116231, + "language_loss": 0.81563449, + "learning_rate": 0.0007252447122218632, + "loss": 0.82739878, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.18579102, + "step": 1927, + "time_per_iteration": 3.151231527328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172974, + "balance_loss_mlp": 1.15472341, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.09894828359622332, + "language_loss": 0.88063776, + "learning_rate": 0.0007249665292228834, + "loss": 
0.89236754, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.18261719, + "step": 1928, + "time_per_iteration": 2.702021360397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173693, + "balance_loss_mlp": 1.1554302, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.08781668530165682, + "language_loss": 0.83526367, + "learning_rate": 0.000724688258884151, + "loss": 0.8470006, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.18249512, + "step": 1929, + "time_per_iteration": 2.560795783996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162512, + "balance_loss_mlp": 1.14461839, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.07372740974795068, + "language_loss": 0.86387187, + "learning_rate": 0.0007244099013137002, + "loss": 0.87549698, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.17907715, + "step": 1930, + "time_per_iteration": 3.090304374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153317, + "balance_loss_mlp": 1.1359247, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.07369885077257772, + "language_loss": 0.88680494, + "learning_rate": 0.0007241314566195993, + "loss": 0.89833808, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.17407227, + "step": 1931, + "time_per_iteration": 3.2688889503479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140069, + "balance_loss_mlp": 1.12190151, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.1370251830388882, + "language_loss": 0.85430074, + "learning_rate": 0.0007238529249099496, + "loss": 0.86570138, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.18164062, + "step": 1932, + "time_per_iteration": 2.6766042709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056936, + "balance_loss_mlp": 1.04673159, + "epoch": 0.3718737976144671, + 
"flos": 1445895567360.0, + "grad_norm": 0.03186229248255652, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.78913808, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.10205078, + "step": 1933, + "time_per_iteration": 4.938454866409302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121071, + "balance_loss_mlp": 1.10291553, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.0858411932854742, + "language_loss": 0.80716681, + "learning_rate": 0.000723295600876581, + "loss": 0.81837749, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.1817627, + "step": 1934, + "time_per_iteration": 3.02756404876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127839, + "balance_loss_mlp": 1.10930252, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.07598025600252532, + "language_loss": 0.87578201, + "learning_rate": 0.0007230168087692344, + "loss": 0.8870604, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.18530273, + "step": 1935, + "time_per_iteration": 2.6842763423919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117422, + "balance_loss_mlp": 1.09867072, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.07638826910824403, + "language_loss": 0.82760978, + "learning_rate": 0.0007227379300790839, + "loss": 0.83878398, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.1875, + "step": 1936, + "time_per_iteration": 3.028691530227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126722, + "balance_loss_mlp": 1.10711217, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.1377793635442251, + "language_loss": 0.85613376, + "learning_rate": 0.0007224589649143997, + "loss": 0.86740094, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.19604492, + "step": 1937, + "time_per_iteration": 2.5564050674438477 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129561, + "balance_loss_mlp": 1.11017799, + "epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.07798966628460335, + "language_loss": 0.80875593, + "learning_rate": 0.0007221799133834861, + "loss": 0.82005155, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.19360352, + "step": 1938, + "time_per_iteration": 2.6535797119140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128571, + "balance_loss_mlp": 1.10997486, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.20771096851505863, + "language_loss": 0.81190193, + "learning_rate": 0.00072190077559468, + "loss": 0.82318759, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.18591309, + "step": 1939, + "time_per_iteration": 2.5281853675842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119884, + "balance_loss_mlp": 1.10124016, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.07206730115622964, + "language_loss": 0.89147639, + "learning_rate": 0.0007216215516563527, + "loss": 0.90267527, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.18640137, + "step": 1940, + "time_per_iteration": 2.7357096672058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112047, + "balance_loss_mlp": 1.1024456, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.09123969930056855, + "language_loss": 0.839782, + "learning_rate": 0.0007213422416769083, + "loss": 0.8509866, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.18029785, + "step": 1941, + "time_per_iteration": 2.6104605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119536, + "balance_loss_mlp": 1.10109389, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.07207094919122449, + "language_loss": 0.75049472, + "learning_rate": 0.0007210628457647849, + "loss": 
0.76169002, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.18444824, + "step": 1942, + "time_per_iteration": 2.5805821418762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129118, + "balance_loss_mlp": 1.11117733, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.10610035509825085, + "language_loss": 0.78376162, + "learning_rate": 0.000720783364028453, + "loss": 0.79505277, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.17956543, + "step": 1943, + "time_per_iteration": 2.780245542526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140529, + "balance_loss_mlp": 1.1218369, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.07224730964326329, + "language_loss": 0.87268645, + "learning_rate": 0.0007205037965764177, + "loss": 0.88409173, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.18688965, + "step": 1944, + "time_per_iteration": 2.5735671520233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151311, + "balance_loss_mlp": 1.13291705, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07659834869138271, + "language_loss": 0.8526088, + "learning_rate": 0.0007202241435172161, + "loss": 0.86412191, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.18408203, + "step": 1945, + "time_per_iteration": 2.7935566902160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126679, + "balance_loss_mlp": 1.10871434, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.3794268789868596, + "language_loss": 0.88413203, + "learning_rate": 0.0007199444049594198, + "loss": 0.89539886, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.17956543, + "step": 1946, + "time_per_iteration": 2.995715379714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127316, + "balance_loss_mlp": 1.10844493, + "epoch": 0.374567141208157, + 
"flos": 524394155520.0, + "grad_norm": 0.0746444377907342, + "language_loss": 0.83035469, + "learning_rate": 0.0007196645810116322, + "loss": 0.8416279, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.1887207, + "step": 1947, + "time_per_iteration": 2.766355037689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142049, + "balance_loss_mlp": 1.12292802, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.07850495494132069, + "language_loss": 0.83822554, + "learning_rate": 0.0007193846717824912, + "loss": 0.84964609, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.19104004, + "step": 1948, + "time_per_iteration": 2.925459623336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133743, + "balance_loss_mlp": 1.11488414, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.08022476151722048, + "language_loss": 0.88327885, + "learning_rate": 0.0007191046773806669, + "loss": 0.89461625, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.18859863, + "step": 1949, + "time_per_iteration": 2.5894553661346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123414, + "balance_loss_mlp": 1.10373282, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.08918312945621011, + "language_loss": 0.83225584, + "learning_rate": 0.0007188245979148631, + "loss": 0.84349, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.19665527, + "step": 1950, + "time_per_iteration": 3.159851551055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126856, + "balance_loss_mlp": 1.1067214, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.11158799296642749, + "language_loss": 0.87878865, + "learning_rate": 0.0007185444334938157, + "loss": 0.89005721, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.20129395, + "step": 1951, + "time_per_iteration": 2.7033133506774902 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111192, + "balance_loss_mlp": 1.09180903, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.09975748916923241, + "language_loss": 0.8500011, + "learning_rate": 0.0007182641842262947, + "loss": 0.86111307, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.19372559, + "step": 1952, + "time_per_iteration": 2.626728057861328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108209, + "balance_loss_mlp": 1.08878958, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.09334076595597436, + "language_loss": 0.77694595, + "learning_rate": 0.0007179838502211022, + "loss": 0.78802806, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.19421387, + "step": 1953, + "time_per_iteration": 2.8748068809509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106763, + "balance_loss_mlp": 1.08678353, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.0737363931585354, + "language_loss": 0.86213845, + "learning_rate": 0.0007177034315870738, + "loss": 0.87320614, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.19970703, + "step": 1954, + "time_per_iteration": 2.961113929748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110959, + "balance_loss_mlp": 1.08933675, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.08944632819393537, + "language_loss": 0.91041321, + "learning_rate": 0.0007174229284330773, + "loss": 0.92150909, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.20239258, + "step": 1955, + "time_per_iteration": 2.6537580490112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113343, + "balance_loss_mlp": 1.09273195, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.10287168416480917, + "language_loss": 0.86629105, + "learning_rate": 0.0007171423408680141, + "loss": 
0.87742448, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.20605469, + "step": 1956, + "time_per_iteration": 2.814793348312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106345, + "balance_loss_mlp": 1.08584106, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.10543893351617999, + "language_loss": 0.89721847, + "learning_rate": 0.0007168616690008176, + "loss": 0.90828192, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.20495605, + "step": 1957, + "time_per_iteration": 2.6851284503936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098402, + "balance_loss_mlp": 1.07823181, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.08262297472790796, + "language_loss": 0.85860795, + "learning_rate": 0.0007165809129404545, + "loss": 0.86959195, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.20166016, + "step": 1958, + "time_per_iteration": 2.756485939025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106731, + "balance_loss_mlp": 1.08695424, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.08262792958454514, + "language_loss": 0.85935986, + "learning_rate": 0.0007163000727959239, + "loss": 0.87042725, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.19775391, + "step": 1959, + "time_per_iteration": 2.525435447692871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070977, + "balance_loss_mlp": 1.06053388, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.03547956764144784, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79030049, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.10449219, + "step": 1960, + "time_per_iteration": 4.89080286026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149436, + "balance_loss_mlp": 1.13035011, + "epoch": 0.3772604848018469, 
+ "flos": 644903534592.0, + "grad_norm": 0.06578569091259368, + "language_loss": 0.84412438, + "learning_rate": 0.00071573814069052, + "loss": 0.85561872, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.19067383, + "step": 1961, + "time_per_iteration": 2.9070186614990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173736, + "balance_loss_mlp": 1.15444791, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.18582927476215966, + "language_loss": 0.87659955, + "learning_rate": 0.0007154570489478081, + "loss": 0.8883369, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.19274902, + "step": 1962, + "time_per_iteration": 3.2049379348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173644, + "balance_loss_mlp": 1.15447557, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.14724331795419812, + "language_loss": 0.86293024, + "learning_rate": 0.0007151758735572514, + "loss": 0.87466669, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.19152832, + "step": 1963, + "time_per_iteration": 3.0349316596984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142067, + "balance_loss_mlp": 1.12338686, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.0939989250476118, + "language_loss": 0.80074733, + "learning_rate": 0.0007148946146280119, + "loss": 0.812168, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.18676758, + "step": 1964, + "time_per_iteration": 2.8144431114196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048428, + "balance_loss_mlp": 1.03836632, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.021748901232604565, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73240578, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.10058594, + "step": 1965, + "time_per_iteration": 4.930070400238037 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055709, + "balance_loss_mlp": 1.04559994, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.023739163757957975, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76397657, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.10107422, + "step": 1966, + "time_per_iteration": 4.934873580932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137343, + "balance_loss_mlp": 1.11776876, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.08213272343580422, + "language_loss": 0.83802509, + "learning_rate": 0.0007140503377003022, + "loss": 0.84939849, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.19555664, + "step": 1967, + "time_per_iteration": 3.0881879329681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139491, + "balance_loss_mlp": 1.11967874, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.1174729362064234, + "language_loss": 0.84845448, + "learning_rate": 0.000713768745708599, + "loss": 0.85984945, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.19799805, + "step": 1968, + "time_per_iteration": 2.635103225708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150253, + "balance_loss_mlp": 1.12999952, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.12024050748438767, + "language_loss": 0.77237123, + "learning_rate": 0.0007134870707245085, + "loss": 0.7838738, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.20251465, + "step": 1969, + "time_per_iteration": 3.2765696048736572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137246, + "balance_loss_mlp": 1.11786246, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.12719814054785675, + "language_loss": 0.84604537, + "learning_rate": 0.0007132053128573864, + "loss": 
0.85741782, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.19372559, + "step": 1970, + "time_per_iteration": 2.741464614868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134845, + "balance_loss_mlp": 1.11534226, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.07594331821705162, + "language_loss": 0.83660662, + "learning_rate": 0.0007129234722166211, + "loss": 0.84795505, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.19482422, + "step": 1971, + "time_per_iteration": 2.879617214202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150531, + "balance_loss_mlp": 1.13185048, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.10702357186833415, + "language_loss": 0.90689349, + "learning_rate": 0.0007126415489116328, + "loss": 0.91839886, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.18676758, + "step": 1972, + "time_per_iteration": 2.7060065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177798, + "balance_loss_mlp": 1.15965438, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.08068810601979462, + "language_loss": 0.81252205, + "learning_rate": 0.0007123595430518736, + "loss": 0.82429999, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.18151855, + "step": 1973, + "time_per_iteration": 2.872903823852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217278, + "balance_loss_mlp": 1.19866943, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.10171747912447733, + "language_loss": 0.86328602, + "learning_rate": 0.0007120774547468282, + "loss": 0.87545884, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.18591309, + "step": 1974, + "time_per_iteration": 2.5397889614105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240679, + "balance_loss_mlp": 1.22244012, + "epoch": 
0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.14549097169765346, + "language_loss": 0.81380564, + "learning_rate": 0.0007117952841060128, + "loss": 0.82621247, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.18249512, + "step": 1975, + "time_per_iteration": 2.6751859188079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203512, + "balance_loss_mlp": 1.18491578, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.08096849874764685, + "language_loss": 0.8358916, + "learning_rate": 0.0007115130312389756, + "loss": 0.84792668, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.18579102, + "step": 1976, + "time_per_iteration": 2.6997742652893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194849, + "balance_loss_mlp": 1.17584705, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.0836403104795401, + "language_loss": 0.78931224, + "learning_rate": 0.0007112306962552973, + "loss": 0.80126077, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.18994141, + "step": 1977, + "time_per_iteration": 2.6066653728485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177391, + "balance_loss_mlp": 1.15869951, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.0835848576107689, + "language_loss": 0.84830624, + "learning_rate": 0.0007109482792645896, + "loss": 0.86008012, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.18676758, + "step": 1978, + "time_per_iteration": 2.7217793464660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163855, + "balance_loss_mlp": 1.14444792, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.18446881037378643, + "language_loss": 0.83627468, + "learning_rate": 0.0007106657803764969, + "loss": 0.84791327, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.19384766, + "step": 1979, + 
"time_per_iteration": 2.7421200275421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142527, + "balance_loss_mlp": 1.12388265, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.07567906441681438, + "language_loss": 0.81599772, + "learning_rate": 0.0007103831997006948, + "loss": 0.82742298, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.18652344, + "step": 1980, + "time_per_iteration": 2.7659311294555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137326, + "balance_loss_mlp": 1.11770415, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.10880870313335556, + "language_loss": 0.85352248, + "learning_rate": 0.0007101005373468908, + "loss": 0.86489582, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.19628906, + "step": 1981, + "time_per_iteration": 2.8786306381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130638, + "balance_loss_mlp": 1.11189866, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.09193767407328653, + "language_loss": 0.86793411, + "learning_rate": 0.0007098177934248242, + "loss": 0.87924051, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.18737793, + "step": 1982, + "time_per_iteration": 2.7491414546966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112619, + "balance_loss_mlp": 1.10644913, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.08063581171786138, + "language_loss": 0.85497284, + "learning_rate": 0.0007095349680442661, + "loss": 0.86623472, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.1973877, + "step": 1983, + "time_per_iteration": 2.8513927459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123414, + "balance_loss_mlp": 1.10408998, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.1315455004610476, + "language_loss": 0.79132575, + 
"learning_rate": 0.0007092520613150188, + "loss": 0.80255985, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.19299316, + "step": 1984, + "time_per_iteration": 2.7137770652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122797, + "balance_loss_mlp": 1.1034615, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.07682315674204161, + "language_loss": 0.81457669, + "learning_rate": 0.0007089690733469165, + "loss": 0.82580465, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.1932373, + "step": 1985, + "time_per_iteration": 2.7019522190093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153334, + "balance_loss_mlp": 1.13452315, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.10399563311309594, + "language_loss": 0.82318014, + "learning_rate": 0.000708686004249825, + "loss": 0.83471346, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.18811035, + "step": 1986, + "time_per_iteration": 2.797624111175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115288, + "balance_loss_mlp": 1.13355637, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.07772659738204864, + "language_loss": 0.91482198, + "learning_rate": 0.0007084028541336413, + "loss": 0.92635083, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.19299316, + "step": 1987, + "time_per_iteration": 2.7236177921295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159354, + "balance_loss_mlp": 1.13969636, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.13308271196687566, + "language_loss": 0.86052763, + "learning_rate": 0.0007081196231082942, + "loss": 0.87212121, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.19641113, + "step": 1988, + "time_per_iteration": 2.837611198425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141061, + 
"balance_loss_mlp": 1.12171304, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.1253750556073725, + "language_loss": 0.79903424, + "learning_rate": 0.0007078363112837436, + "loss": 0.81044483, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.19335938, + "step": 1989, + "time_per_iteration": 2.8450546264648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135085, + "balance_loss_mlp": 1.11594021, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.06314586189395412, + "language_loss": 0.8480984, + "learning_rate": 0.000707552918769981, + "loss": 0.85944927, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.19128418, + "step": 1990, + "time_per_iteration": 2.5055301189422607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117815, + "balance_loss_mlp": 1.09837222, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.09018786790446763, + "language_loss": 0.8355186, + "learning_rate": 0.000707269445677029, + "loss": 0.84669679, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.19433594, + "step": 1991, + "time_per_iteration": 2.790247917175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120171, + "balance_loss_mlp": 1.10065699, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.07803627169317769, + "language_loss": 0.8551231, + "learning_rate": 0.0007069858921149416, + "loss": 0.86632484, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.19494629, + "step": 1992, + "time_per_iteration": 2.9850950241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128282, + "balance_loss_mlp": 1.10929155, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.08439673282063015, + "language_loss": 0.86369681, + "learning_rate": 0.0007067022581938043, + "loss": 0.87497962, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 
0.18981934, + "step": 1993, + "time_per_iteration": 2.838817834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120054, + "balance_loss_mlp": 1.10115981, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.10464401531680585, + "language_loss": 0.83076423, + "learning_rate": 0.0007064185440237334, + "loss": 0.84196478, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.18884277, + "step": 1994, + "time_per_iteration": 2.7403006553649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113897, + "balance_loss_mlp": 1.09485924, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.07520001194530918, + "language_loss": 0.8432954, + "learning_rate": 0.0007061347497148764, + "loss": 0.85443437, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.19018555, + "step": 1995, + "time_per_iteration": 2.797116994857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117711, + "balance_loss_mlp": 1.0988524, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.10442861201560887, + "language_loss": 0.86312652, + "learning_rate": 0.0007058508753774122, + "loss": 0.87430364, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.18847656, + "step": 1996, + "time_per_iteration": 2.708909511566162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111759, + "balance_loss_mlp": 1.098791, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.07371207674818485, + "language_loss": 0.86599022, + "learning_rate": 0.0007055669211215505, + "loss": 0.87716615, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.18786621, + "step": 1997, + "time_per_iteration": 2.639425277709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129297, + "balance_loss_mlp": 1.11073565, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.10349237512498541, + "language_loss": 
0.77684987, + "learning_rate": 0.0007052828870575322, + "loss": 0.7881428, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.1854248, + "step": 1998, + "time_per_iteration": 2.6582653522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141105, + "balance_loss_mlp": 1.12290192, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.06112561257491971, + "language_loss": 0.8669157, + "learning_rate": 0.0007049987732956291, + "loss": 0.87832677, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.18212891, + "step": 1999, + "time_per_iteration": 2.9868295192718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130964, + "balance_loss_mlp": 1.11211705, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.05929570453342199, + "language_loss": 0.82587528, + "learning_rate": 0.0007047145799461439, + "loss": 0.83718491, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.18835449, + "step": 2000, + "time_per_iteration": 2.8687593936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136368, + "balance_loss_mlp": 1.11759257, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.08059531994541343, + "language_loss": 0.82050723, + "learning_rate": 0.00070443030711941, + "loss": 0.83187091, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.18762207, + "step": 2001, + "time_per_iteration": 2.7824347019195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113557, + "balance_loss_mlp": 1.11636579, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.09146293400396303, + "language_loss": 0.8213051, + "learning_rate": 0.0007041459549257924, + "loss": 0.83266079, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.19189453, + "step": 2002, + "time_per_iteration": 2.8634302616119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137137, + 
"balance_loss_mlp": 1.11758697, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 0.08512403296601297, + "language_loss": 0.78107333, + "learning_rate": 0.0007038615234756859, + "loss": 0.79244471, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.1953125, + "step": 2003, + "time_per_iteration": 3.2058236598968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136926, + "balance_loss_mlp": 1.11745918, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.07973278859066837, + "language_loss": 0.840294, + "learning_rate": 0.000703577012879517, + "loss": 0.85166335, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.19458008, + "step": 2004, + "time_per_iteration": 2.7286102771759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144109, + "balance_loss_mlp": 1.12510681, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.07975228006523119, + "language_loss": 0.88714588, + "learning_rate": 0.0007032924232477423, + "loss": 0.89858699, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.19006348, + "step": 2005, + "time_per_iteration": 2.6980981826782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136738, + "balance_loss_mlp": 1.11721206, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.08525396844891328, + "language_loss": 0.8036226, + "learning_rate": 0.0007030077546908493, + "loss": 0.81499004, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.19506836, + "step": 2006, + "time_per_iteration": 2.6433420181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225281, + "balance_loss_mlp": 1.21288347, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.07049383229006134, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84289944, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 
0.12402344, + "step": 2007, + "time_per_iteration": 4.82226037979126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113221, + "balance_loss_mlp": 1.11288631, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.07446306607004384, + "language_loss": 0.78622216, + "learning_rate": 0.0007024381812438117, + "loss": 0.7975443, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.19299316, + "step": 2008, + "time_per_iteration": 2.52738618850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128683, + "balance_loss_mlp": 1.10928798, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.09860455371344472, + "language_loss": 0.82941681, + "learning_rate": 0.0007021532765747951, + "loss": 0.84070361, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.19396973, + "step": 2009, + "time_per_iteration": 3.007847309112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135681, + "balance_loss_mlp": 1.115821, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.08526755269117656, + "language_loss": 0.79078948, + "learning_rate": 0.0007018682934229162, + "loss": 0.80214632, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.1986084, + "step": 2010, + "time_per_iteration": 2.9435882568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122545, + "balance_loss_mlp": 1.10262537, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.06758132101189684, + "language_loss": 0.82111001, + "learning_rate": 0.0007015832318988152, + "loss": 0.83233541, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.19909668, + "step": 2011, + "time_per_iteration": 2.6552624702453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043733, + "balance_loss_mlp": 1.03133512, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.01882295684379882, + "language_loss": 
0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74933803, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.12402344, + "step": 2012, + "time_per_iteration": 5.011860609054565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111441, + "balance_loss_mlp": 1.09159219, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.07301389252885741, + "language_loss": 0.84162498, + "learning_rate": 0.0007010128741766604, + "loss": 0.85273933, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.19836426, + "step": 2013, + "time_per_iteration": 2.766516923904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111771, + "balance_loss_mlp": 1.09080195, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.10834212581561939, + "language_loss": 0.84428859, + "learning_rate": 0.0007007275782000391, + "loss": 0.85540634, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.20983887, + "step": 2014, + "time_per_iteration": 2.6184933185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108065, + "balance_loss_mlp": 1.08796668, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.07735715793711462, + "language_loss": 0.8448838, + "learning_rate": 0.0007004422042940605, + "loss": 0.85596442, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.20092773, + "step": 2015, + "time_per_iteration": 2.5543320178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109418, + "balance_loss_mlp": 1.08941483, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.08270873816767256, + "language_loss": 0.89443475, + "learning_rate": 0.0007001567525695169, + "loss": 0.9055289, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.19995117, + "step": 2016, + "time_per_iteration": 2.6072936058044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106093, + 
"balance_loss_mlp": 1.08593512, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + "grad_norm": 0.06162053071135558, + "language_loss": 0.83763885, + "learning_rate": 0.0006998712231372303, + "loss": 0.84869981, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.20166016, + "step": 2017, + "time_per_iteration": 3.0785679817199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110727, + "balance_loss_mlp": 1.08730268, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06865572989075389, + "language_loss": 0.86015558, + "learning_rate": 0.0006995856161080532, + "loss": 0.87122822, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.19958496, + "step": 2018, + "time_per_iteration": 2.8914577960968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112506, + "balance_loss_mlp": 1.09202576, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.07931380391873609, + "language_loss": 0.82694459, + "learning_rate": 0.0006992999315928679, + "loss": 0.83806968, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.20483398, + "step": 2019, + "time_per_iteration": 2.7892749309539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110031, + "balance_loss_mlp": 1.08994412, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.08754557392654386, + "language_loss": 0.85419971, + "learning_rate": 0.0006990141697025871, + "loss": 0.8653, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.20080566, + "step": 2020, + "time_per_iteration": 2.7910003662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038615, + "balance_loss_mlp": 1.02712286, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.02439767662091094, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77398252, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 
0.11474609, + "step": 2021, + "time_per_iteration": 4.809415340423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125614, + "balance_loss_mlp": 1.10596848, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0885537285439357, + "language_loss": 0.82239556, + "learning_rate": 0.0006984424142405392, + "loss": 0.83365172, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.19641113, + "step": 2022, + "time_per_iteration": 2.8510379791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124515, + "balance_loss_mlp": 1.10540605, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.08944143564846467, + "language_loss": 0.82328045, + "learning_rate": 0.0006981564208907474, + "loss": 0.83452559, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.19091797, + "step": 2023, + "time_per_iteration": 2.6450161933898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125021, + "balance_loss_mlp": 1.10580468, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.06744861114448035, + "language_loss": 0.89889395, + "learning_rate": 0.0006978703506098102, + "loss": 0.91014421, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.19189453, + "step": 2024, + "time_per_iteration": 2.845273494720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142716, + "balance_loss_mlp": 1.12338066, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.22805579315722818, + "language_loss": 0.87903351, + "learning_rate": 0.00069758420350879, + "loss": 0.89046067, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.1932373, + "step": 2025, + "time_per_iteration": 2.673590898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147111, + "balance_loss_mlp": 1.12706041, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.08766781252639666, + 
"language_loss": 0.85837841, + "learning_rate": 0.000697297979698779, + "loss": 0.86984944, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.20043945, + "step": 2026, + "time_per_iteration": 2.7639670372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146919, + "balance_loss_mlp": 1.12766671, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06921765861152807, + "language_loss": 0.83379734, + "learning_rate": 0.0006970116792908992, + "loss": 0.84526652, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.19226074, + "step": 2027, + "time_per_iteration": 3.1537575721740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165828, + "balance_loss_mlp": 1.14574075, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.10608539967442848, + "language_loss": 0.81162727, + "learning_rate": 0.000696725302396302, + "loss": 0.82328546, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.20080566, + "step": 2028, + "time_per_iteration": 2.713486671447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169814, + "balance_loss_mlp": 1.14985871, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.08953149679914804, + "language_loss": 0.85771465, + "learning_rate": 0.0006964388491261692, + "loss": 0.86941278, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.19946289, + "step": 2029, + "time_per_iteration": 3.2685461044311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117313, + "balance_loss_mlp": 1.15280437, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.07138064393758646, + "language_loss": 0.87465048, + "learning_rate": 0.0006961523195917114, + "loss": 0.88638175, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.20324707, + "step": 2030, + "time_per_iteration": 2.8363735675811768 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01173533, + "balance_loss_mlp": 1.15370905, + "epoch": 0.39072720277029627, + "flos": 548882500608.0, + "grad_norm": 0.07919234366723153, + "language_loss": 0.78095168, + "learning_rate": 0.0006958657139041696, + "loss": 0.792687, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.19812012, + "step": 2031, + "time_per_iteration": 2.7535581588745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093882, + "balance_loss_mlp": 1.0820564, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.028372833662772774, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77806854, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.11816406, + "step": 2032, + "time_per_iteration": 4.918071508407593 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162947, + "balance_loss_mlp": 1.14219236, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.08595509799025135, + "language_loss": 0.78080893, + "learning_rate": 0.0006952922745149434, + "loss": 0.79243839, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.2076416, + "step": 2033, + "time_per_iteration": 2.6598660945892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160858, + "balance_loss_mlp": 1.14035416, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.06804618944659446, + "language_loss": 0.87450963, + "learning_rate": 0.000695005441035888, + "loss": 0.88611823, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.20507812, + "step": 2034, + "time_per_iteration": 2.6846048831939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073577, + "balance_loss_mlp": 1.06218028, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.025244772676945967, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7479701, + "num_input_tokens_seen": 
169556416, + "router_z_loss_mlp": 0.11376953, + "step": 2035, + "time_per_iteration": 4.866973638534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147653, + "balance_loss_mlp": 1.12698257, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.06481204645981475, + "language_loss": 0.80968261, + "learning_rate": 0.0006944315470656863, + "loss": 0.82115912, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.20678711, + "step": 2036, + "time_per_iteration": 2.973759412765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139407, + "balance_loss_mlp": 1.11935592, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.08143475646221604, + "language_loss": 0.90850043, + "learning_rate": 0.000694144486797345, + "loss": 0.91989452, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.20043945, + "step": 2037, + "time_per_iteration": 2.736645221710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042479, + "balance_loss_mlp": 1.03184605, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.02072601949350613, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80562913, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.10644531, + "step": 2038, + "time_per_iteration": 4.651543140411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130224, + "balance_loss_mlp": 1.11029196, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.08780788201299033, + "language_loss": 0.89056122, + "learning_rate": 0.0006935701402514156, + "loss": 0.90186346, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.19921875, + "step": 2039, + "time_per_iteration": 2.610884666442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025416, + "balance_loss_mlp": 1.01525903, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 
0.013600241372588764, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.74060309, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.1015625, + "step": 2040, + "time_per_iteration": 4.982971906661987 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139694, + "balance_loss_mlp": 1.12033463, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.07758123210342138, + "language_loss": 0.84211379, + "learning_rate": 0.0006929954931031422, + "loss": 0.85351074, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.19348145, + "step": 2041, + "time_per_iteration": 3.722700595855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143114, + "balance_loss_mlp": 1.12322998, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.05684242147097161, + "language_loss": 0.88287592, + "learning_rate": 0.0006927080570819805, + "loss": 0.89430702, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.19885254, + "step": 2042, + "time_per_iteration": 2.6228466033935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146557, + "balance_loss_mlp": 1.12712598, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.09880041485830528, + "language_loss": 0.80978543, + "learning_rate": 0.0006924205462449161, + "loss": 0.82125103, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.19421387, + "step": 2043, + "time_per_iteration": 2.5959606170654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130878, + "balance_loss_mlp": 1.11204302, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.07421884933278829, + "language_loss": 0.81996524, + "learning_rate": 0.0006921329607035702, + "loss": 0.83127403, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.18823242, + "step": 2044, + "time_per_iteration": 3.2492971420288086 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.0112622, + "balance_loss_mlp": 1.10749173, + "epoch": 0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.0837559423677037, + "language_loss": 0.87882477, + "learning_rate": 0.0006918453005695938, + "loss": 0.89008695, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.18701172, + "step": 2045, + "time_per_iteration": 2.649426221847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120666, + "balance_loss_mlp": 1.10098422, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.0619155211719984, + "language_loss": 0.84122574, + "learning_rate": 0.0006915575659546662, + "loss": 0.85243243, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.19665527, + "step": 2046, + "time_per_iteration": 2.7105627059936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109896, + "balance_loss_mlp": 1.09044123, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.0891593284161872, + "language_loss": 0.80576289, + "learning_rate": 0.0006912697569704959, + "loss": 0.81686187, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.19445801, + "step": 2047, + "time_per_iteration": 2.700460910797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117623, + "balance_loss_mlp": 1.09800088, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.09048948583390962, + "language_loss": 0.86559486, + "learning_rate": 0.0006909818737288205, + "loss": 0.87677109, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.19604492, + "step": 2048, + "time_per_iteration": 2.593365430831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122837, + "balance_loss_mlp": 1.10311985, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.0812760632256331, + "language_loss": 0.8078903, + "learning_rate": 0.000690693916341406, + "loss": 0.81911868, + "num_input_tokens_seen": 
170859232, + "router_z_loss_mlp": 0.19702148, + "step": 2049, + "time_per_iteration": 2.6433444023132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114252, + "balance_loss_mlp": 1.09472609, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.0788936263124851, + "language_loss": 0.82210761, + "learning_rate": 0.0006904058849200475, + "loss": 0.83325016, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.19506836, + "step": 2050, + "time_per_iteration": 2.7488439083099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114662, + "balance_loss_mlp": 1.09468246, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.10945632429468012, + "language_loss": 0.8477484, + "learning_rate": 0.0006901177795765683, + "loss": 0.858895, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.19970703, + "step": 2051, + "time_per_iteration": 2.6071059703826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101751, + "balance_loss_mlp": 1.08223617, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.07628310806963638, + "language_loss": 0.81390727, + "learning_rate": 0.0006898296004228213, + "loss": 0.82492483, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.19494629, + "step": 2052, + "time_per_iteration": 2.725609540939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195172, + "balance_loss_mlp": 1.18334627, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.06244005501870815, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79321915, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.11816406, + "step": 2053, + "time_per_iteration": 4.871281862258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111122, + "balance_loss_mlp": 1.09123778, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 
0.08281763462186637, + "language_loss": 0.79986715, + "learning_rate": 0.0006892530211320763, + "loss": 0.81097841, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.19873047, + "step": 2054, + "time_per_iteration": 2.7042620182037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125901, + "balance_loss_mlp": 1.10589778, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.08642547559894523, + "language_loss": 0.83690774, + "learning_rate": 0.000688964621218926, + "loss": 0.8481667, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.19995117, + "step": 2055, + "time_per_iteration": 2.6359920501708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120805, + "balance_loss_mlp": 1.10112405, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.10380118482872411, + "language_loss": 0.79915357, + "learning_rate": 0.0006886761479432037, + "loss": 0.81036162, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.19665527, + "step": 2056, + "time_per_iteration": 2.872950792312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122886, + "balance_loss_mlp": 1.10250163, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.07844536568455973, + "language_loss": 0.8461678, + "learning_rate": 0.0006883876014169045, + "loss": 0.8573966, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.20385742, + "step": 2057, + "time_per_iteration": 2.5555264949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132813, + "balance_loss_mlp": 1.11285698, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.08268955880836791, + "language_loss": 0.90132928, + "learning_rate": 0.000688098981752052, + "loss": 0.91265738, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.19946289, + "step": 2058, + "time_per_iteration": 2.7518441677093506 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01134779, + "balance_loss_mlp": 1.11504984, + "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.09934928750763956, + "language_loss": 0.80161107, + "learning_rate": 0.0006878102890606982, + "loss": 0.81295884, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.19726562, + "step": 2059, + "time_per_iteration": 3.098393678665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122363, + "balance_loss_mlp": 1.10231209, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.08965795352869743, + "language_loss": 0.80914015, + "learning_rate": 0.0006875215234549239, + "loss": 0.82036376, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.20043945, + "step": 2060, + "time_per_iteration": 2.591871976852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112284, + "balance_loss_mlp": 1.10284913, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.08963098282996143, + "language_loss": 0.85349464, + "learning_rate": 0.0006872326850468376, + "loss": 0.86472309, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.19995117, + "step": 2061, + "time_per_iteration": 2.7322757244110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121862, + "balance_loss_mlp": 1.10210919, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.08450203568488315, + "language_loss": 0.78602254, + "learning_rate": 0.0006869437739485762, + "loss": 0.79724109, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.19750977, + "step": 2062, + "time_per_iteration": 2.679453134536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.09274697, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.07578248331540363, + "language_loss": 0.92750496, + "learning_rate": 0.0006866547902723053, + "loss": 0.93862933, + "num_input_tokens_seen": 
172012336, + "router_z_loss_mlp": 0.19677734, + "step": 2063, + "time_per_iteration": 2.680661201477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100055, + "balance_loss_mlp": 1.08058822, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.07543651474129125, + "language_loss": 0.80317062, + "learning_rate": 0.000686365734130218, + "loss": 0.8141712, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.19458008, + "step": 2064, + "time_per_iteration": 2.695892095565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106834, + "balance_loss_mlp": 1.08669949, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.08078876442086359, + "language_loss": 0.84065503, + "learning_rate": 0.000686076605634536, + "loss": 0.85172331, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.20129395, + "step": 2065, + "time_per_iteration": 2.642617702484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113253, + "balance_loss_mlp": 1.0935117, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.08876156008903276, + "language_loss": 0.84441757, + "learning_rate": 0.0006857874048975088, + "loss": 0.85555011, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.19726562, + "step": 2066, + "time_per_iteration": 2.6363344192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102381, + "balance_loss_mlp": 1.08237755, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.06515627567230846, + "language_loss": 0.87180257, + "learning_rate": 0.0006854981320314142, + "loss": 0.88282633, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.19995117, + "step": 2067, + "time_per_iteration": 2.510763645172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105961, + "balance_loss_mlp": 1.08644629, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 
0.08362186096435482, + "language_loss": 0.86780995, + "learning_rate": 0.0006852087871485579, + "loss": 0.87886953, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.19506836, + "step": 2068, + "time_per_iteration": 2.653662919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106158, + "balance_loss_mlp": 1.08698964, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.09469661693362608, + "language_loss": 0.81769943, + "learning_rate": 0.0006849193703612735, + "loss": 0.82876104, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.19177246, + "step": 2069, + "time_per_iteration": 2.7798843383789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094976, + "balance_loss_mlp": 1.0750916, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.07513124412486355, + "language_loss": 0.77589542, + "learning_rate": 0.0006846298817819225, + "loss": 0.78684515, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.19873047, + "step": 2070, + "time_per_iteration": 2.984025716781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094931, + "balance_loss_mlp": 1.07543969, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.07496601113124422, + "language_loss": 0.80744815, + "learning_rate": 0.0006843403215228945, + "loss": 0.8183974, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.19482422, + "step": 2071, + "time_per_iteration": 2.4528424739837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113518, + "balance_loss_mlp": 1.09400368, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.10952507549773222, + "language_loss": 0.80553752, + "learning_rate": 0.0006840506896966065, + "loss": 0.81667268, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.19519043, + "step": 2072, + "time_per_iteration": 2.7193689346313477 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01113479, + "balance_loss_mlp": 1.09405994, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.07287911350271854, + "language_loss": 0.81897116, + "learning_rate": 0.0006837609864155038, + "loss": 0.8301059, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.1940918, + "step": 2073, + "time_per_iteration": 2.9260082244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110629, + "balance_loss_mlp": 1.09179354, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.0731734663182413, + "language_loss": 0.83157325, + "learning_rate": 0.0006834712117920592, + "loss": 0.8426795, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.18823242, + "step": 2074, + "time_per_iteration": 2.629744052886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117154, + "balance_loss_mlp": 1.09769917, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.07643256719558747, + "language_loss": 0.85673088, + "learning_rate": 0.0006831813659387729, + "loss": 0.8679024, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.19433594, + "step": 2075, + "time_per_iteration": 2.5350148677825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116666, + "balance_loss_mlp": 1.0971514, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.07671111115245405, + "language_loss": 0.84214932, + "learning_rate": 0.0006828914489681733, + "loss": 0.85331595, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.19494629, + "step": 2076, + "time_per_iteration": 2.724330425262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125458, + "balance_loss_mlp": 1.10627747, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.08210563860740908, + "language_loss": 0.85224628, + "learning_rate": 0.0006826014609928162, + "loss": 0.86350089, + 
"num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.19165039, + "step": 2077, + "time_per_iteration": 2.737734079360962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070244, + "balance_loss_mlp": 1.06070685, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.03932449118700248, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84269631, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.09521484, + "step": 2078, + "time_per_iteration": 4.887951612472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124282, + "balance_loss_mlp": 1.10458827, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.09240147129054761, + "language_loss": 0.80077326, + "learning_rate": 0.0006820212724781896, + "loss": 0.81201607, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.19677734, + "step": 2079, + "time_per_iteration": 2.6855874061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114733, + "balance_loss_mlp": 1.09537315, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.0724055342629082, + "language_loss": 0.84239459, + "learning_rate": 0.0006817310721641694, + "loss": 0.85354191, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.19335938, + "step": 2080, + "time_per_iteration": 2.902536392211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122461, + "balance_loss_mlp": 1.10289896, + "epoch": 0.4003462870334744, + "flos": 520356939264.0, + "grad_norm": 0.0894692108770988, + "language_loss": 0.83972865, + "learning_rate": 0.00068144080129589, + "loss": 0.85095322, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.19543457, + "step": 2081, + "time_per_iteration": 2.613067865371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122596, + "balance_loss_mlp": 1.1030333, + "epoch": 0.400538668718738, + "flos": 
492518195712.0, + "grad_norm": 0.09472281695894083, + "language_loss": 0.82174724, + "learning_rate": 0.0006811504599860441, + "loss": 0.83297324, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.19555664, + "step": 2082, + "time_per_iteration": 2.6002771854400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111624, + "balance_loss_mlp": 1.09634447, + "epoch": 0.40073105040400153, + "flos": 490356052992.0, + "grad_norm": 0.06828551193852998, + "language_loss": 0.85353184, + "learning_rate": 0.0006808600483473526, + "loss": 0.86469424, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.19897461, + "step": 2083, + "time_per_iteration": 2.9010846614837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107422, + "balance_loss_mlp": 1.0885756, + "epoch": 0.4009234320892651, + "flos": 562378070016.0, + "grad_norm": 0.07802980838834611, + "language_loss": 0.8652671, + "learning_rate": 0.0006805695664925629, + "loss": 0.87634128, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.18823242, + "step": 2084, + "time_per_iteration": 2.8027803897857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111632, + "balance_loss_mlp": 1.0970912, + "epoch": 0.40111581377452865, + "flos": 425998808064.0, + "grad_norm": 0.08245020261724635, + "language_loss": 0.8423562, + "learning_rate": 0.0006802790145344506, + "loss": 0.85351944, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.19238281, + "step": 2085, + "time_per_iteration": 2.5397531986236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119142, + "balance_loss_mlp": 1.10039067, + "epoch": 0.40130819545979224, + "flos": 612446842368.0, + "grad_norm": 0.07508565386227965, + "language_loss": 0.87270218, + "learning_rate": 0.0006799883925858176, + "loss": 0.88389367, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.18737793, + "step": 2086, + "time_per_iteration": 2.876164197921753 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112869, + "balance_loss_mlp": 1.10978329, + "epoch": 0.40150057714505577, + "flos": 523433124864.0, + "grad_norm": 0.07429159623595777, + "language_loss": 0.84809011, + "learning_rate": 0.0006796977007594933, + "loss": 0.85937703, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.18896484, + "step": 2087, + "time_per_iteration": 2.6302778720855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136639, + "balance_loss_mlp": 1.11681485, + "epoch": 0.40169295883031936, + "flos": 561424379904.0, + "grad_norm": 0.06510767025647884, + "language_loss": 0.86000383, + "learning_rate": 0.0006794069391683345, + "loss": 0.8713702, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.19824219, + "step": 2088, + "time_per_iteration": 2.7642226219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125568, + "balance_loss_mlp": 1.10582721, + "epoch": 0.4018853405155829, + "flos": 518997984768.0, + "grad_norm": 0.07763642733040174, + "language_loss": 0.80219448, + "learning_rate": 0.0006791161079252248, + "loss": 0.81345016, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.19726562, + "step": 2089, + "time_per_iteration": 2.6216719150543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112898, + "balance_loss_mlp": 1.10969198, + "epoch": 0.4020777222008465, + "flos": 526222614528.0, + "grad_norm": 0.06753993516242088, + "language_loss": 0.82396168, + "learning_rate": 0.0006788252071430747, + "loss": 0.83525145, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.19262695, + "step": 2090, + "time_per_iteration": 2.6881613731384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136792, + "balance_loss_mlp": 1.11759949, + "epoch": 0.40227010388611006, + "flos": 525763021824.0, + "grad_norm": 0.07938192983074185, + "language_loss": 0.86496997, + "learning_rate": 0.0006785342369348222, + "loss": 
0.87633789, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 0.19177246, + "step": 2091, + "time_per_iteration": 2.7187774181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134169, + "balance_loss_mlp": 1.11497617, + "epoch": 0.4024624855713736, + "flos": 432304252416.0, + "grad_norm": 0.08007566317284716, + "language_loss": 0.79674286, + "learning_rate": 0.0006782431974134316, + "loss": 0.80808461, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.19189453, + "step": 2092, + "time_per_iteration": 2.5497889518737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112292, + "balance_loss_mlp": 1.10301197, + "epoch": 0.4026548672566372, + "flos": 766660640256.0, + "grad_norm": 0.09546920549904063, + "language_loss": 0.89602369, + "learning_rate": 0.0006779520886918949, + "loss": 0.90725285, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.19897461, + "step": 2093, + "time_per_iteration": 3.070051431655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126954, + "balance_loss_mlp": 1.10783303, + "epoch": 0.4028472489419007, + "flos": 642931914240.0, + "grad_norm": 0.07932487566864904, + "language_loss": 0.81140947, + "learning_rate": 0.0006776609108832301, + "loss": 0.82267904, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.19116211, + "step": 2094, + "time_per_iteration": 2.8635079860687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117981, + "balance_loss_mlp": 1.09895563, + "epoch": 0.4030396306271643, + "flos": 491838718464.0, + "grad_norm": 0.08200776323916202, + "language_loss": 0.85093951, + "learning_rate": 0.0006773696641004828, + "loss": 0.86211932, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.19006348, + "step": 2095, + "time_per_iteration": 2.569387435913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119321, + "balance_loss_mlp": 1.09972358, + "epoch": 0.40323201231242783, + 
"flos": 901728308736.0, + "grad_norm": 0.09231967023328698, + "language_loss": 0.77639973, + "learning_rate": 0.0006770783484567247, + "loss": 0.78759301, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.19592285, + "step": 2096, + "time_per_iteration": 3.1237080097198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109679, + "balance_loss_mlp": 1.09033108, + "epoch": 0.4034243939976914, + "flos": 570558961152.0, + "grad_norm": 0.07679281592908915, + "language_loss": 0.86043823, + "learning_rate": 0.000676786964065055, + "loss": 0.871535, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.1932373, + "step": 2097, + "time_per_iteration": 2.785017728805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112121, + "balance_loss_mlp": 1.10181427, + "epoch": 0.403616775682955, + "flos": 507456783360.0, + "grad_norm": 0.07049509838223245, + "language_loss": 0.78567326, + "learning_rate": 0.0006764955110385986, + "loss": 0.79688537, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.19384766, + "step": 2098, + "time_per_iteration": 2.7599899768829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011178, + "balance_loss_mlp": 1.09878576, + "epoch": 0.40380915736821854, + "flos": 519383425536.0, + "grad_norm": 0.07587511524565468, + "language_loss": 0.8025918, + "learning_rate": 0.0006762039894905083, + "loss": 0.81376982, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.19006348, + "step": 2099, + "time_per_iteration": 2.6616034507751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115132, + "balance_loss_mlp": 1.09635651, + "epoch": 0.40400153905348213, + "flos": 441925590528.0, + "grad_norm": 0.08446355623188201, + "language_loss": 0.80088019, + "learning_rate": 0.000675912399533962, + "loss": 0.81203151, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.1875, + "step": 2100, + "time_per_iteration": 2.53584885597229 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129908, + "balance_loss_mlp": 1.1112045, + "epoch": 0.40419392073874566, + "flos": 772309002240.0, + "grad_norm": 0.057425192194628195, + "language_loss": 0.84893382, + "learning_rate": 0.0006756207412821656, + "loss": 0.86023289, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.18701172, + "step": 2101, + "time_per_iteration": 3.0146372318267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133046, + "balance_loss_mlp": 1.11444974, + "epoch": 0.40438630242400925, + "flos": 766569235968.0, + "grad_norm": 0.08385244443422216, + "language_loss": 0.79946959, + "learning_rate": 0.0006753290148483505, + "loss": 0.81080002, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.18603516, + "step": 2102, + "time_per_iteration": 3.1141843795776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131753, + "balance_loss_mlp": 1.11306119, + "epoch": 0.4045786841092728, + "flos": 415235828736.0, + "grad_norm": 0.10321495678621663, + "language_loss": 0.7855078, + "learning_rate": 0.0006750372203457752, + "loss": 0.79682529, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.18688965, + "step": 2103, + "time_per_iteration": 2.5273704528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133211, + "balance_loss_mlp": 1.1144712, + "epoch": 0.40477106579453637, + "flos": 539214174720.0, + "grad_norm": 0.06897182936898366, + "language_loss": 0.86569643, + "learning_rate": 0.0006747453578877242, + "loss": 0.87702858, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.18725586, + "step": 2104, + "time_per_iteration": 2.7731292247772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136147, + "balance_loss_mlp": 1.11752641, + "epoch": 0.4049634474797999, + "flos": 826704258048.0, + "grad_norm": 0.08357448735589112, + "language_loss": 0.82917869, + "learning_rate": 0.0006744534275875085, + "loss": 
0.84054017, + "num_input_tokens_seen": 175194512, + "router_z_loss_mlp": 0.1862793, + "step": 2105, + "time_per_iteration": 3.0466742515563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148022, + "balance_loss_mlp": 1.12974763, + "epoch": 0.4051558291650635, + "flos": 572684027904.0, + "grad_norm": 0.09276188373090515, + "language_loss": 0.85562009, + "learning_rate": 0.0006741614295584657, + "loss": 0.8671003, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.18273926, + "step": 2106, + "time_per_iteration": 2.678776264190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115565, + "balance_loss_mlp": 1.13704157, + "epoch": 0.4053482108503271, + "flos": 731881391616.0, + "grad_norm": 0.0813184956351506, + "language_loss": 0.78235412, + "learning_rate": 0.0006738693639139595, + "loss": 0.79391062, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.18603516, + "step": 2107, + "time_per_iteration": 3.0155587196350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157609, + "balance_loss_mlp": 1.13920343, + "epoch": 0.4055405925355906, + "flos": 1213059193344.0, + "grad_norm": 0.09421684944263367, + "language_loss": 0.77232802, + "learning_rate": 0.0006735772307673796, + "loss": 0.78390408, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.18408203, + "step": 2108, + "time_per_iteration": 3.586928129196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165053, + "balance_loss_mlp": 1.14651608, + "epoch": 0.4057329742208542, + "flos": 715863204864.0, + "grad_norm": 0.06861239024528153, + "language_loss": 0.83003211, + "learning_rate": 0.0006732850302321421, + "loss": 0.84168267, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.18518066, + "step": 2109, + "time_per_iteration": 2.9429726600646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160041, + "balance_loss_mlp": 1.14086008, + "epoch": 0.4059253559061177, + 
"flos": 564888577536.0, + "grad_norm": 0.07515968908819307, + "language_loss": 0.84144229, + "learning_rate": 0.00067299276242169, + "loss": 0.85304272, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.19177246, + "step": 2110, + "time_per_iteration": 2.6710071563720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044281, + "balance_loss_mlp": 1.03436232, + "epoch": 0.4061177375913813, + "flos": 1593744450048.0, + "grad_norm": 0.023257265358085616, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75426447, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.09912109, + "step": 2111, + "time_per_iteration": 4.914813756942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151064, + "balance_loss_mlp": 1.13221717, + "epoch": 0.40631011927664484, + "flos": 615421711872.0, + "grad_norm": 0.09830411974127871, + "language_loss": 0.77889705, + "learning_rate": 0.0006724080254290395, + "loss": 0.79040766, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.18811035, + "step": 2112, + "time_per_iteration": 2.8067259788513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136038, + "balance_loss_mlp": 1.11665511, + "epoch": 0.40650250096190843, + "flos": 557661376512.0, + "grad_norm": 0.07964969066506762, + "language_loss": 0.89744002, + "learning_rate": 0.0006721155564738566, + "loss": 0.90880042, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.19360352, + "step": 2113, + "time_per_iteration": 2.7009260654449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105786, + "balance_loss_mlp": 1.04798985, + "epoch": 0.40669488264717196, + "flos": 1580147564544.0, + "grad_norm": 0.033284036056789104, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79680502, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.09863281, + "step": 2114, + "time_per_iteration": 
4.983005523681641 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127405, + "balance_loss_mlp": 1.10823643, + "epoch": 0.40688726433243555, + "flos": 507649503744.0, + "grad_norm": 0.07850906735960049, + "language_loss": 0.85233408, + "learning_rate": 0.0006715304182135078, + "loss": 0.86360812, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.19152832, + "step": 2115, + "time_per_iteration": 2.6078672409057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114305, + "balance_loss_mlp": 1.09480286, + "epoch": 0.40707964601769914, + "flos": 589075172352.0, + "grad_norm": 0.063032684383759, + "language_loss": 0.88685012, + "learning_rate": 0.0006712377491355127, + "loss": 0.89799315, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.19482422, + "step": 2116, + "time_per_iteration": 2.8919928073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011132, + "balance_loss_mlp": 1.09403157, + "epoch": 0.40727202770296267, + "flos": 580437259776.0, + "grad_norm": 0.07591389839440288, + "language_loss": 0.81216896, + "learning_rate": 0.0006709450135771274, + "loss": 0.82330096, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.19152832, + "step": 2117, + "time_per_iteration": 2.948209524154663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110152, + "balance_loss_mlp": 1.09097123, + "epoch": 0.40746440938822626, + "flos": 504076649472.0, + "grad_norm": 0.0664106663118444, + "language_loss": 0.86270058, + "learning_rate": 0.0006706522116520023, + "loss": 0.87380207, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.19177246, + "step": 2118, + "time_per_iteration": 2.63297963142395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109766, + "balance_loss_mlp": 1.09078836, + "epoch": 0.4076567910734898, + "flos": 605600312832.0, + "grad_norm": 0.08309315753094405, + "language_loss": 0.82646739, + "learning_rate": 
0.0006703593434738127, + "loss": 0.83756506, + "num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.18969727, + "step": 2119, + "time_per_iteration": 2.7504334449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110305, + "balance_loss_mlp": 1.08339202, + "epoch": 0.4078491727587534, + "flos": 479553799680.0, + "grad_norm": 0.06315918122435989, + "language_loss": 0.78157568, + "learning_rate": 0.0006700664091562604, + "loss": 0.79260623, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.1965332, + "step": 2120, + "time_per_iteration": 2.5809123516082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109509, + "balance_loss_mlp": 1.08968461, + "epoch": 0.4080415544440169, + "flos": 510384665088.0, + "grad_norm": 0.06251573302429693, + "language_loss": 0.84974718, + "learning_rate": 0.0006697734088130725, + "loss": 0.86084229, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.19812012, + "step": 2121, + "time_per_iteration": 2.6444742679595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103604, + "balance_loss_mlp": 1.08350492, + "epoch": 0.4082339361292805, + "flos": 734638947840.0, + "grad_norm": 0.08444724355881765, + "language_loss": 0.85282058, + "learning_rate": 0.0006694803425580018, + "loss": 0.86385661, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.20080566, + "step": 2122, + "time_per_iteration": 2.9844353199005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101862, + "balance_loss_mlp": 1.08126235, + "epoch": 0.4084263178145441, + "flos": 457472074752.0, + "grad_norm": 0.08120556309716129, + "language_loss": 0.84838599, + "learning_rate": 0.0006691872105048268, + "loss": 0.85940456, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.20605469, + "step": 2123, + "time_per_iteration": 2.587648868560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104271, + "balance_loss_mlp": 1.08323061, 
+ "epoch": 0.4086186994998076, + "flos": 562931638272.0, + "grad_norm": 0.07277240915985977, + "language_loss": 0.84579539, + "learning_rate": 0.0006688940127673513, + "loss": 0.85683805, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.21044922, + "step": 2124, + "time_per_iteration": 2.6976451873779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108248, + "balance_loss_mlp": 1.08663535, + "epoch": 0.4088110811850712, + "flos": 573669651456.0, + "grad_norm": 0.07888289921071225, + "language_loss": 0.85375637, + "learning_rate": 0.0006686007494594049, + "loss": 0.86483884, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.21618652, + "step": 2125, + "time_per_iteration": 2.842721700668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109657, + "balance_loss_mlp": 1.075279, + "epoch": 0.40900346287033473, + "flos": 456930989568.0, + "grad_norm": 0.1494487487543463, + "language_loss": 0.80707026, + "learning_rate": 0.0006683074206948425, + "loss": 0.81803596, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.21289062, + "step": 2126, + "time_per_iteration": 2.54156231880188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088684, + "balance_loss_mlp": 1.06790602, + "epoch": 0.4091958445555983, + "flos": 617395903488.0, + "grad_norm": 0.07127639192135228, + "language_loss": 0.81315231, + "learning_rate": 0.0006680140265875443, + "loss": 0.82403916, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.20788574, + "step": 2127, + "time_per_iteration": 2.8282980918884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093135, + "balance_loss_mlp": 1.07241678, + "epoch": 0.40938822624086185, + "flos": 472400750592.0, + "grad_norm": 0.07736719826860473, + "language_loss": 0.953547, + "learning_rate": 0.0006677205672514162, + "loss": 0.96447837, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.20715332, + "step": 2128, + 
"time_per_iteration": 2.635601758956909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089263, + "balance_loss_mlp": 1.06965339, + "epoch": 0.40958060792612544, + "flos": 570010535424.0, + "grad_norm": 0.07314070036202396, + "language_loss": 0.88630438, + "learning_rate": 0.000667427042800389, + "loss": 0.89719707, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.19604492, + "step": 2129, + "time_per_iteration": 2.792956829071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094452, + "balance_loss_mlp": 1.07447219, + "epoch": 0.40977298961138897, + "flos": 609346063872.0, + "grad_norm": 0.07258896862524182, + "language_loss": 0.82793128, + "learning_rate": 0.0006671334533484192, + "loss": 0.83887583, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.19970703, + "step": 2130, + "time_per_iteration": 2.773900270462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_mlp": 1.07694483, + "epoch": 0.40996537129665256, + "flos": 581744457216.0, + "grad_norm": 0.07325583153216161, + "language_loss": 0.83178955, + "learning_rate": 0.0006668397990094881, + "loss": 0.84274781, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.1887207, + "step": 2131, + "time_per_iteration": 2.752606153488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110283, + "balance_loss_mlp": 1.08409071, + "epoch": 0.41015775298191615, + "flos": 516546948096.0, + "grad_norm": 0.08072513277707091, + "language_loss": 0.84810466, + "learning_rate": 0.0006665460798976027, + "loss": 0.85913295, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.18725586, + "step": 2132, + "time_per_iteration": 2.7918195724487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101646, + "balance_loss_mlp": 1.08277488, + "epoch": 0.4103501346671797, + "flos": 510354929664.0, + "grad_norm": 0.057661652953568024, + "language_loss": 0.8113941, + 
"learning_rate": 0.0006662522961267947, + "loss": 0.82241058, + "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.18859863, + "step": 2133, + "time_per_iteration": 2.7084174156188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114188, + "balance_loss_mlp": 1.09586525, + "epoch": 0.41054251635244327, + "flos": 549752126976.0, + "grad_norm": 0.07117823449693282, + "language_loss": 0.86957145, + "learning_rate": 0.0006659584478111211, + "loss": 0.88071334, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.18322754, + "step": 2134, + "time_per_iteration": 2.8745734691619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120532, + "balance_loss_mlp": 1.10234094, + "epoch": 0.4107348980377068, + "flos": 839898450432.0, + "grad_norm": 0.10436544040673855, + "language_loss": 0.82673836, + "learning_rate": 0.000665664535064664, + "loss": 0.83794367, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.1817627, + "step": 2135, + "time_per_iteration": 3.0361244678497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120758, + "balance_loss_mlp": 1.10167265, + "epoch": 0.4109272797229704, + "flos": 503708461056.0, + "grad_norm": 0.07372821186051039, + "language_loss": 0.82676935, + "learning_rate": 0.0006653705580015303, + "loss": 0.83797693, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.1907959, + "step": 2136, + "time_per_iteration": 2.6784329414367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121286, + "balance_loss_mlp": 1.10184264, + "epoch": 0.4111196614082339, + "flos": 610830927360.0, + "grad_norm": 0.08099943161450797, + "language_loss": 0.8610462, + "learning_rate": 0.0006650765167358523, + "loss": 0.87225902, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.19421387, + "step": 2137, + "time_per_iteration": 2.8350300788879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113363, + 
"balance_loss_mlp": 1.09431374, + "epoch": 0.4113120430934975, + "flos": 453165414912.0, + "grad_norm": 0.09328592607957716, + "language_loss": 0.89696336, + "learning_rate": 0.0006647824113817864, + "loss": 0.90809703, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.19030762, + "step": 2138, + "time_per_iteration": 2.5345799922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112598, + "balance_loss_mlp": 1.09391761, + "epoch": 0.41150442477876104, + "flos": 541600971264.0, + "grad_norm": 0.24980936370747706, + "language_loss": 0.81674927, + "learning_rate": 0.000664488242053515, + "loss": 0.82787526, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.18688965, + "step": 2139, + "time_per_iteration": 2.729074716567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112354, + "balance_loss_mlp": 1.09430587, + "epoch": 0.4116968064640246, + "flos": 576291386880.0, + "grad_norm": 0.06520257719296937, + "language_loss": 0.8372556, + "learning_rate": 0.0006641940088652445, + "loss": 0.84837914, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.18054199, + "step": 2140, + "time_per_iteration": 2.822861909866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114304, + "balance_loss_mlp": 1.09476542, + "epoch": 0.4118891881492882, + "flos": 496115642880.0, + "grad_norm": 0.09690666410410188, + "language_loss": 0.82505018, + "learning_rate": 0.0006638997119312065, + "loss": 0.8361932, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.1953125, + "step": 2141, + "time_per_iteration": 2.7164361476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081351, + "balance_loss_mlp": 1.0707655, + "epoch": 0.41208156983455174, + "flos": 1538395877376.0, + "grad_norm": 0.03550975461959617, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.7614466, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 
0.10595703, + "step": 2142, + "time_per_iteration": 4.951165437698364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116466, + "balance_loss_mlp": 1.09734452, + "epoch": 0.41227395151981533, + "flos": 584968946688.0, + "grad_norm": 0.10349541439789608, + "language_loss": 0.8488189, + "learning_rate": 0.000663310927282877, + "loss": 0.8599835, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.19116211, + "step": 2143, + "time_per_iteration": 2.834325075149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123685, + "balance_loss_mlp": 1.10346723, + "epoch": 0.41246633320507886, + "flos": 442926268416.0, + "grad_norm": 0.07414481576642443, + "language_loss": 0.85735166, + "learning_rate": 0.000663016439797172, + "loss": 0.86858845, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.20214844, + "step": 2144, + "time_per_iteration": 2.641390800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118144, + "balance_loss_mlp": 1.09814095, + "epoch": 0.41265871489034245, + "flos": 579962985984.0, + "grad_norm": 0.07853696289984005, + "language_loss": 0.80941319, + "learning_rate": 0.0006627218890228724, + "loss": 0.82059467, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.20007324, + "step": 2145, + "time_per_iteration": 2.7847142219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115703, + "balance_loss_mlp": 1.0958544, + "epoch": 0.412851096575606, + "flos": 761229964800.0, + "grad_norm": 0.07518431098775835, + "language_loss": 0.83727562, + "learning_rate": 0.0006624272750743326, + "loss": 0.84843272, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.19836426, + "step": 2146, + "time_per_iteration": 3.0317938327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117359, + "balance_loss_mlp": 1.09733224, + "epoch": 0.41304347826086957, + "flos": 555353501184.0, + "grad_norm": 0.06462993006694184, + 
"language_loss": 0.8283999, + "learning_rate": 0.0006621325980659322, + "loss": 0.83957344, + "num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.20019531, + "step": 2147, + "time_per_iteration": 2.786724328994751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118, + "balance_loss_mlp": 1.0978415, + "epoch": 0.41323585994613315, + "flos": 665712940032.0, + "grad_norm": 0.10640671392978962, + "language_loss": 0.81600213, + "learning_rate": 0.000661837858112075, + "loss": 0.82718211, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.20153809, + "step": 2148, + "time_per_iteration": 2.854837417602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115633, + "balance_loss_mlp": 1.09577227, + "epoch": 0.4134282416313967, + "flos": 548699692032.0, + "grad_norm": 0.06752887879335369, + "language_loss": 0.88443303, + "learning_rate": 0.0006615430553271888, + "loss": 0.89558935, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.19848633, + "step": 2149, + "time_per_iteration": 2.8243539333343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115349, + "balance_loss_mlp": 1.09486902, + "epoch": 0.4136206233166603, + "flos": 646262489088.0, + "grad_norm": 0.06757702274708675, + "language_loss": 0.85010874, + "learning_rate": 0.0006612481898257264, + "loss": 0.8612622, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.20483398, + "step": 2150, + "time_per_iteration": 2.870486259460449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114158, + "balance_loss_mlp": 1.09377337, + "epoch": 0.4138130050019238, + "flos": 517354905600.0, + "grad_norm": 0.08316851802653256, + "language_loss": 0.85005617, + "learning_rate": 0.000660953261722165, + "loss": 0.86119783, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.20385742, + "step": 2151, + "time_per_iteration": 2.6056485176086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01112582, + "balance_loss_mlp": 1.09265018, + "epoch": 0.4140053866871874, + "flos": 609254659584.0, + "grad_norm": 0.06870221870710541, + "language_loss": 0.82367688, + "learning_rate": 0.0006606582711310055, + "loss": 0.83480269, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.19934082, + "step": 2152, + "time_per_iteration": 2.7264139652252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119446, + "balance_loss_mlp": 1.09854901, + "epoch": 0.4141977683724509, + "flos": 579762925056.0, + "grad_norm": 0.0720639200532027, + "language_loss": 0.83059323, + "learning_rate": 0.0006603632181667736, + "loss": 0.8417877, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.20910645, + "step": 2153, + "time_per_iteration": 2.6930761337280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055226, + "balance_loss_mlp": 1.04149318, + "epoch": 0.4143901500577145, + "flos": 1307312317440.0, + "grad_norm": 0.029268536031501605, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.79998553, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.13769531, + "step": 2154, + "time_per_iteration": 4.951904773712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133368, + "balance_loss_mlp": 1.11335301, + "epoch": 0.41458253174297804, + "flos": 460189983744.0, + "grad_norm": 0.08213185756435645, + "language_loss": 0.81797659, + "learning_rate": 0.0006597729255773153, + "loss": 0.82931024, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.20007324, + "step": 2155, + "time_per_iteration": 2.6153218746185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142819, + "balance_loss_mlp": 1.1224227, + "epoch": 0.41477491342824163, + "flos": 553364628480.0, + "grad_norm": 0.0847752552783981, + "language_loss": 0.82203597, + "learning_rate": 0.0006594776861812608, + "loss": 0.83346415, + "num_input_tokens_seen": 179574864, + 
"router_z_loss_mlp": 0.20397949, + "step": 2156, + "time_per_iteration": 2.68922758102417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153838, + "balance_loss_mlp": 1.13410926, + "epoch": 0.4149672951135052, + "flos": 697771708416.0, + "grad_norm": 0.06809079383741527, + "language_loss": 0.86262864, + "learning_rate": 0.0006591823848704776, + "loss": 0.87416703, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.1973877, + "step": 2157, + "time_per_iteration": 2.9523754119873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147297, + "balance_loss_mlp": 1.12693584, + "epoch": 0.41515967679876875, + "flos": 565750863360.0, + "grad_norm": 0.07690135227418383, + "language_loss": 0.81358635, + "learning_rate": 0.0006588870217596117, + "loss": 0.82505929, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.20361328, + "step": 2158, + "time_per_iteration": 2.7730822563171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140876, + "balance_loss_mlp": 1.12146926, + "epoch": 0.41535205848403234, + "flos": 501185843712.0, + "grad_norm": 0.08370852265526307, + "language_loss": 0.857876, + "learning_rate": 0.0006585915969633334, + "loss": 0.86928475, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.19396973, + "step": 2159, + "time_per_iteration": 2.6628706455230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133859, + "balance_loss_mlp": 1.1143918, + "epoch": 0.41554444016929587, + "flos": 607554680832.0, + "grad_norm": 0.07868666241976846, + "language_loss": 0.8926276, + "learning_rate": 0.0006582961105963366, + "loss": 0.90396619, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.19445801, + "step": 2160, + "time_per_iteration": 2.856227397918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126433, + "balance_loss_mlp": 1.10702562, + "epoch": 0.41573682185455946, + "flos": 529115991552.0, + "grad_norm": 
0.10110909063497833, + "language_loss": 0.77701914, + "learning_rate": 0.0006580005627733395, + "loss": 0.78828347, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.19396973, + "step": 2161, + "time_per_iteration": 2.763690948486328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131659, + "balance_loss_mlp": 1.11281204, + "epoch": 0.415929203539823, + "flos": 504956187648.0, + "grad_norm": 0.0788483903527846, + "language_loss": 0.81671721, + "learning_rate": 0.0006577049536090838, + "loss": 0.8280338, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.18823242, + "step": 2162, + "time_per_iteration": 2.7156083583831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130408, + "balance_loss_mlp": 1.11114359, + "epoch": 0.4161215852250866, + "flos": 582737794560.0, + "grad_norm": 0.08609543464950487, + "language_loss": 0.85536218, + "learning_rate": 0.000657409283218335, + "loss": 0.8666662, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.19250488, + "step": 2163, + "time_per_iteration": 2.711332082748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135342, + "balance_loss_mlp": 1.11707878, + "epoch": 0.4163139669103501, + "flos": 490697077248.0, + "grad_norm": 0.08463355465100361, + "language_loss": 0.81072271, + "learning_rate": 0.0006571135517158829, + "loss": 0.82207608, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.18273926, + "step": 2164, + "time_per_iteration": 2.6715452671051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029054, + "balance_loss_mlp": 1.01865911, + "epoch": 0.4165063485956137, + "flos": 1288158474240.0, + "grad_norm": 0.01758070932569607, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77793115, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.10400391, + "step": 2165, + "time_per_iteration": 4.765650272369385 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01154419, + "balance_loss_mlp": 1.13588202, + "epoch": 0.4166987302808773, + "flos": 495263268864.0, + "grad_norm": 0.09117992314911788, + "language_loss": 0.828076, + "learning_rate": 0.0006565219058351444, + "loss": 0.83962023, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.18530273, + "step": 2166, + "time_per_iteration": 2.568162202835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160705, + "balance_loss_mlp": 1.14153659, + "epoch": 0.4168911119661408, + "flos": 464071555584.0, + "grad_norm": 0.1435965153845973, + "language_loss": 0.82720423, + "learning_rate": 0.0006562259916865553, + "loss": 0.83881128, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.19165039, + "step": 2167, + "time_per_iteration": 2.577831506729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146961, + "balance_loss_mlp": 1.12813759, + "epoch": 0.4170834936514044, + "flos": 536787730944.0, + "grad_norm": 0.10197305761412122, + "language_loss": 0.79348731, + "learning_rate": 0.0006559300168856573, + "loss": 0.80495691, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.18798828, + "step": 2168, + "time_per_iteration": 2.7849843502044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143742, + "balance_loss_mlp": 1.12485933, + "epoch": 0.41727587533666793, + "flos": 550683795456.0, + "grad_norm": 0.07754195288754885, + "language_loss": 0.86023396, + "learning_rate": 0.0006556339815473577, + "loss": 0.87167138, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.1887207, + "step": 2169, + "time_per_iteration": 2.7085328102111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142677, + "balance_loss_mlp": 1.12390125, + "epoch": 0.4174682570219315, + "flos": 631111357440.0, + "grad_norm": 0.08981224380419678, + "language_loss": 0.86090291, + "learning_rate": 0.000655337885786588, + "loss": 0.87232965, + "num_input_tokens_seen": 
180744896, + "router_z_loss_mlp": 0.1875, + "step": 2170, + "time_per_iteration": 2.9244213104248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128382, + "balance_loss_mlp": 1.10963011, + "epoch": 0.41766063870719505, + "flos": 519751613952.0, + "grad_norm": 0.08419137591764536, + "language_loss": 0.8483454, + "learning_rate": 0.0006550417297183025, + "loss": 0.85962915, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.18737793, + "step": 2171, + "time_per_iteration": 2.6424126625061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116149, + "balance_loss_mlp": 1.09746861, + "epoch": 0.41785302039245864, + "flos": 557935589376.0, + "grad_norm": 0.07276027667818112, + "language_loss": 0.81700563, + "learning_rate": 0.0006547455134574793, + "loss": 0.82816714, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.18664551, + "step": 2172, + "time_per_iteration": 2.743807315826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118053, + "balance_loss_mlp": 1.09920597, + "epoch": 0.41804540207772223, + "flos": 788529821184.0, + "grad_norm": 0.06582530373346562, + "language_loss": 0.83907378, + "learning_rate": 0.0006544492371191198, + "loss": 0.85025424, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.18847656, + "step": 2173, + "time_per_iteration": 3.1398048400878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112075, + "balance_loss_mlp": 1.09203625, + "epoch": 0.41823778376298576, + "flos": 904332418560.0, + "grad_norm": 0.07927924785081189, + "language_loss": 0.83028531, + "learning_rate": 0.0006541529008182485, + "loss": 0.84140611, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.20031738, + "step": 2174, + "time_per_iteration": 3.218675136566162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113128, + "balance_loss_mlp": 1.09423363, + "epoch": 0.41843016544824935, + "flos": 511560811008.0, + "grad_norm": 
0.08063752220274202, + "language_loss": 0.87068301, + "learning_rate": 0.0006538565046699136, + "loss": 0.88181424, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.18884277, + "step": 2175, + "time_per_iteration": 2.623373031616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110698, + "balance_loss_mlp": 1.09179151, + "epoch": 0.4186225471335129, + "flos": 653077085184.0, + "grad_norm": 0.10224918928766584, + "language_loss": 0.80967259, + "learning_rate": 0.0006535600487891862, + "loss": 0.82077956, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.18896484, + "step": 2176, + "time_per_iteration": 2.858027935028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108436, + "balance_loss_mlp": 1.08948123, + "epoch": 0.41881492881877647, + "flos": 569158161408.0, + "grad_norm": 0.0620502143296578, + "language_loss": 0.88827038, + "learning_rate": 0.0006532635332911603, + "loss": 0.89935476, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.1895752, + "step": 2177, + "time_per_iteration": 2.6979219913482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126272, + "balance_loss_mlp": 1.10786629, + "epoch": 0.41900731050404, + "flos": 911878248960.0, + "grad_norm": 0.06643064450406437, + "language_loss": 0.80475914, + "learning_rate": 0.0006529669582909541, + "loss": 0.81602192, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.18408203, + "step": 2178, + "time_per_iteration": 3.246621608734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112675, + "balance_loss_mlp": 1.10820079, + "epoch": 0.4191996921893036, + "flos": 535755119616.0, + "grad_norm": 0.08441696273800357, + "language_loss": 0.85626066, + "learning_rate": 0.0006526703239037077, + "loss": 0.8675282, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.1854248, + "step": 2179, + "time_per_iteration": 2.67114520072937 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01126772, + "balance_loss_mlp": 1.10779428, + "epoch": 0.4193920738745671, + "flos": 582636478464.0, + "grad_norm": 0.07577304920294069, + "language_loss": 0.86212498, + "learning_rate": 0.0006523736302445851, + "loss": 0.8733927, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.18969727, + "step": 2180, + "time_per_iteration": 2.7883896827697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132192, + "balance_loss_mlp": 1.11371422, + "epoch": 0.4195844555598307, + "flos": 1335782472192.0, + "grad_norm": 0.08559665169482955, + "language_loss": 0.77047896, + "learning_rate": 0.0006520768774287728, + "loss": 0.78180093, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.18469238, + "step": 2181, + "time_per_iteration": 3.777104616165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127516, + "balance_loss_mlp": 1.10862184, + "epoch": 0.4197768372450943, + "flos": 598783145472.0, + "grad_norm": 0.06395892384144108, + "language_loss": 0.85356331, + "learning_rate": 0.0006517800655714806, + "loss": 0.86483848, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.18884277, + "step": 2182, + "time_per_iteration": 2.8449056148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116562, + "balance_loss_mlp": 1.09781027, + "epoch": 0.4199692189303578, + "flos": 735261525504.0, + "grad_norm": 0.07104751702384272, + "language_loss": 0.85029149, + "learning_rate": 0.0006514831947879407, + "loss": 0.86145711, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.18737793, + "step": 2183, + "time_per_iteration": 2.990061044692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107738, + "balance_loss_mlp": 1.08917689, + "epoch": 0.4201616006156214, + "flos": 750214794240.0, + "grad_norm": 0.10339737087855795, + "language_loss": 0.78075212, + "learning_rate": 0.0006511862651934091, + "loss": 0.79182947, + "num_input_tokens_seen": 
181837952, + "router_z_loss_mlp": 0.18566895, + "step": 2184, + "time_per_iteration": 3.0668697357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107089, + "balance_loss_mlp": 1.08805084, + "epoch": 0.42035398230088494, + "flos": 547029448704.0, + "grad_norm": 0.06769253041220206, + "language_loss": 0.8183164, + "learning_rate": 0.0006508892769031638, + "loss": 0.82938731, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.19018555, + "step": 2185, + "time_per_iteration": 2.6562998294830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109571, + "balance_loss_mlp": 1.0908668, + "epoch": 0.42054636398614853, + "flos": 616911717888.0, + "grad_norm": 0.09820566679610492, + "language_loss": 0.86607713, + "learning_rate": 0.000650592230032506, + "loss": 0.87717283, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.18676758, + "step": 2186, + "time_per_iteration": 2.7687323093414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115297, + "balance_loss_mlp": 1.09592557, + "epoch": 0.42073874567141206, + "flos": 640394242560.0, + "grad_norm": 0.07480815577141971, + "language_loss": 0.84954965, + "learning_rate": 0.0006502951246967595, + "loss": 0.86070257, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.19360352, + "step": 2187, + "time_per_iteration": 2.8850929737091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105112, + "balance_loss_mlp": 1.0856576, + "epoch": 0.42093112735667565, + "flos": 493783174656.0, + "grad_norm": 0.07526055561420332, + "language_loss": 0.86650884, + "learning_rate": 0.0006499979610112706, + "loss": 0.87756002, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.19445801, + "step": 2188, + "time_per_iteration": 2.6973655223846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110798, + "balance_loss_mlp": 1.09087813, + "epoch": 0.4211235090419392, + "flos": 542364512256.0, + 
"grad_norm": 0.09941258674264111, + "language_loss": 0.84241974, + "learning_rate": 0.000649700739091409, + "loss": 0.85352778, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.19921875, + "step": 2189, + "time_per_iteration": 2.701725482940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067004, + "balance_loss_mlp": 1.05665708, + "epoch": 0.42131589072720277, + "flos": 1532149530624.0, + "grad_norm": 0.03283150548513283, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.7490328, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.10351562, + "step": 2190, + "time_per_iteration": 4.839817523956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010909, + "balance_loss_mlp": 1.07154024, + "epoch": 0.42150827241246636, + "flos": 566852857344.0, + "grad_norm": 0.06598643326088396, + "language_loss": 0.85153967, + "learning_rate": 0.0006491061210101557, + "loss": 0.86244869, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.19335938, + "step": 2191, + "time_per_iteration": 2.7196173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010923, + "balance_loss_mlp": 1.07263041, + "epoch": 0.4217006540977299, + "flos": 707561174016.0, + "grad_norm": 0.0656106941658015, + "language_loss": 0.83940744, + "learning_rate": 0.0006488087250796157, + "loss": 0.85033047, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.1965332, + "step": 2192, + "time_per_iteration": 2.906759262084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092958, + "balance_loss_mlp": 1.07264447, + "epoch": 0.4218930357829935, + "flos": 627291454464.0, + "grad_norm": 0.07249831154737209, + "language_loss": 0.81628364, + "learning_rate": 0.0006485112713764049, + "loss": 0.82721323, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.203125, + "step": 2193, + "time_per_iteration": 2.92899227142334 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01094124, + "balance_loss_mlp": 1.0746212, + "epoch": 0.422085417468257, + "flos": 460345628160.0, + "grad_norm": 0.06737861087768351, + "language_loss": 0.83769715, + "learning_rate": 0.0006482137600160051, + "loss": 0.8486383, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.19506836, + "step": 2194, + "time_per_iteration": 2.5262770652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085984, + "balance_loss_mlp": 1.06623149, + "epoch": 0.4222777991535206, + "flos": 474026577408.0, + "grad_norm": 0.06292139363287808, + "language_loss": 0.845213, + "learning_rate": 0.0006479161911139206, + "loss": 0.85607278, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.1973877, + "step": 2195, + "time_per_iteration": 2.6160459518432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108518, + "balance_loss_mlp": 1.06428266, + "epoch": 0.4224701808387841, + "flos": 470886151680.0, + "grad_norm": 0.08901996634588341, + "language_loss": 0.8583566, + "learning_rate": 0.0006476185647856778, + "loss": 0.8692084, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.2088623, + "step": 2196, + "time_per_iteration": 2.5868523120880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092161, + "balance_loss_mlp": 1.07174015, + "epoch": 0.4226625625240477, + "flos": 677525783040.0, + "grad_norm": 0.08593083287674207, + "language_loss": 0.8143295, + "learning_rate": 0.0006473208811468255, + "loss": 0.8252511, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.20422363, + "step": 2197, + "time_per_iteration": 2.8999974727630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094808, + "balance_loss_mlp": 1.07459044, + "epoch": 0.4228549442093113, + "flos": 503525652480.0, + "grad_norm": 0.06766081582077942, + "language_loss": 0.84457636, + "learning_rate": 0.0006470231403129347, + "loss": 0.85552448, + "num_input_tokens_seen": 
183022016, + "router_z_loss_mlp": 0.20214844, + "step": 2198, + "time_per_iteration": 2.6292834281921387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100004, + "balance_loss_mlp": 1.08031106, + "epoch": 0.42304732589457483, + "flos": 611848857600.0, + "grad_norm": 0.06420895179660353, + "language_loss": 0.81433302, + "learning_rate": 0.0006467253423995988, + "loss": 0.82533306, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.19677734, + "step": 2199, + "time_per_iteration": 2.891252040863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106456, + "balance_loss_mlp": 1.08667946, + "epoch": 0.4232397075798384, + "flos": 515570863104.0, + "grad_norm": 0.09520170564639865, + "language_loss": 0.79070157, + "learning_rate": 0.000646427487522433, + "loss": 0.80176616, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.19763184, + "step": 2200, + "time_per_iteration": 2.6773481369018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114103, + "balance_loss_mlp": 1.09451675, + "epoch": 0.42343208926510195, + "flos": 589796868096.0, + "grad_norm": 0.05852623049494667, + "language_loss": 0.8313483, + "learning_rate": 0.0006461295757970749, + "loss": 0.84248924, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.19567871, + "step": 2201, + "time_per_iteration": 2.8689796924591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134362, + "balance_loss_mlp": 1.11422753, + "epoch": 0.42362447095036554, + "flos": 640636521984.0, + "grad_norm": 0.08800937436321304, + "language_loss": 0.8125912, + "learning_rate": 0.0006458316073391839, + "loss": 0.82393485, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.20141602, + "step": 2202, + "time_per_iteration": 2.88208270072937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131451, + "balance_loss_mlp": 1.11259222, + "epoch": 0.42381685263562907, + "flos": 512680057344.0, + "grad_norm": 
0.0666307669316128, + "language_loss": 0.87698853, + "learning_rate": 0.0006455335822644422, + "loss": 0.88830304, + "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.18847656, + "step": 2203, + "time_per_iteration": 2.670079469680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148041, + "balance_loss_mlp": 1.12951636, + "epoch": 0.42400923432089266, + "flos": 546782400000.0, + "grad_norm": 0.09426146221356531, + "language_loss": 0.77927971, + "learning_rate": 0.0006452355006885527, + "loss": 0.79076016, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.18530273, + "step": 2204, + "time_per_iteration": 2.657381534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113566, + "balance_loss_mlp": 1.11668229, + "epoch": 0.4242016160061562, + "flos": 622154815488.0, + "grad_norm": 0.09902645475712538, + "language_loss": 0.8715145, + "learning_rate": 0.0006449373627272412, + "loss": 0.88287115, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.18969727, + "step": 2205, + "time_per_iteration": 2.731816053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119544, + "balance_loss_mlp": 1.10088801, + "epoch": 0.4243939976914198, + "flos": 571913146368.0, + "grad_norm": 0.08117714281203407, + "language_loss": 0.82472396, + "learning_rate": 0.0006446391684962553, + "loss": 0.8359195, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.18652344, + "step": 2206, + "time_per_iteration": 2.6578545570373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111897, + "balance_loss_mlp": 1.09364557, + "epoch": 0.42458637937668336, + "flos": 448740186624.0, + "grad_norm": 0.07468362398894425, + "language_loss": 0.83251357, + "learning_rate": 0.000644340918111364, + "loss": 0.84363258, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.18249512, + "step": 2207, + "time_per_iteration": 2.56805419921875 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01116636, + "balance_loss_mlp": 1.09764564, + "epoch": 0.4247787610619469, + "flos": 435407602176.0, + "grad_norm": 0.07806782722385266, + "language_loss": 0.84652972, + "learning_rate": 0.0006440426116883585, + "loss": 0.85769606, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.18981934, + "step": 2208, + "time_per_iteration": 2.5546016693115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117381, + "balance_loss_mlp": 1.09860539, + "epoch": 0.4249711427472105, + "flos": 496078566912.0, + "grad_norm": 0.06957413499154663, + "language_loss": 0.86008334, + "learning_rate": 0.0006437442493430519, + "loss": 0.87125719, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.18762207, + "step": 2209, + "time_per_iteration": 2.709622621536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116308, + "balance_loss_mlp": 1.09817648, + "epoch": 0.425163524432474, + "flos": 655819587072.0, + "grad_norm": 0.07293604534963509, + "language_loss": 0.86852837, + "learning_rate": 0.000643445831191278, + "loss": 0.87969142, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.18127441, + "step": 2210, + "time_per_iteration": 2.9363558292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129839, + "balance_loss_mlp": 1.11201715, + "epoch": 0.4253559061177376, + "flos": 650608796160.0, + "grad_norm": 0.09052715570846585, + "language_loss": 0.81454134, + "learning_rate": 0.0006431473573488937, + "loss": 0.82583976, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.17834473, + "step": 2211, + "time_per_iteration": 2.824688196182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113072, + "balance_loss_mlp": 1.09480882, + "epoch": 0.42554828780300114, + "flos": 554155333632.0, + "grad_norm": 0.1062817873742978, + "language_loss": 0.8489396, + "learning_rate": 0.0006428488279317765, + "loss": 0.86007035, + "num_input_tokens_seen": 
184031728, + "router_z_loss_mlp": 0.18273926, + "step": 2212, + "time_per_iteration": 2.7016141414642334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115888, + "balance_loss_mlp": 1.0979948, + "epoch": 0.4257406694882647, + "flos": 514407200256.0, + "grad_norm": 0.11732172807674658, + "language_loss": 0.87377149, + "learning_rate": 0.0006425502430558259, + "loss": 0.88493037, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.17907715, + "step": 2213, + "time_per_iteration": 2.618800640106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119747, + "balance_loss_mlp": 1.10144818, + "epoch": 0.42593305117352825, + "flos": 515645015040.0, + "grad_norm": 0.0715384053232906, + "language_loss": 0.84687829, + "learning_rate": 0.0006422516028369628, + "loss": 0.85807574, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.18310547, + "step": 2214, + "time_per_iteration": 2.6705808639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111255, + "balance_loss_mlp": 1.09299207, + "epoch": 0.42612543285879184, + "flos": 588059813376.0, + "grad_norm": 0.10790889315219483, + "language_loss": 0.83148849, + "learning_rate": 0.0006419529073911296, + "loss": 0.84260106, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.18261719, + "step": 2215, + "time_per_iteration": 2.8703150749206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129195, + "balance_loss_mlp": 1.11081314, + "epoch": 0.42631781454405543, + "flos": 635472345600.0, + "grad_norm": 0.06359649877678734, + "language_loss": 0.85258245, + "learning_rate": 0.0006416541568342901, + "loss": 0.86387444, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.18383789, + "step": 2216, + "time_per_iteration": 2.8891868591308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150464, + "balance_loss_mlp": 1.13197434, + "epoch": 0.42651019622931896, + "flos": 541161202176.0, + 
"grad_norm": 0.08324056394919786, + "language_loss": 0.84084767, + "learning_rate": 0.0006413553512824297, + "loss": 0.85235232, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.18481445, + "step": 2217, + "time_per_iteration": 2.7485709190368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.13043642, + "epoch": 0.42670257791458255, + "flos": 558158045184.0, + "grad_norm": 0.07361406588428895, + "language_loss": 0.84362692, + "learning_rate": 0.0006410564908515549, + "loss": 0.85511333, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.18200684, + "step": 2218, + "time_per_iteration": 2.657747507095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147496, + "balance_loss_mlp": 1.12895846, + "epoch": 0.4268949595998461, + "flos": 621309782016.0, + "grad_norm": 0.08313238940479123, + "language_loss": 0.85059869, + "learning_rate": 0.0006407575756576935, + "loss": 0.86207366, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.18530273, + "step": 2219, + "time_per_iteration": 2.7391462326049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151304, + "balance_loss_mlp": 1.13211131, + "epoch": 0.42708734128510967, + "flos": 537919460352.0, + "grad_norm": 0.08558880584649159, + "language_loss": 0.87292302, + "learning_rate": 0.0006404586058168951, + "loss": 0.88443601, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.19189453, + "step": 2220, + "time_per_iteration": 2.7562613487243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142405, + "balance_loss_mlp": 1.12310505, + "epoch": 0.4272797229703732, + "flos": 502865998848.0, + "grad_norm": 0.08712204240656665, + "language_loss": 0.86527437, + "learning_rate": 0.0006401595814452296, + "loss": 0.87669843, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.19287109, + "step": 2221, + "time_per_iteration": 2.6396138668060303 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120795, + "balance_loss_mlp": 1.10141122, + "epoch": 0.4274721046556368, + "flos": 492453955584.0, + "grad_norm": 0.07683160316407273, + "language_loss": 0.80591571, + "learning_rate": 0.000639860502658789, + "loss": 0.81712359, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.19360352, + "step": 2222, + "time_per_iteration": 2.655627489089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115469, + "balance_loss_mlp": 1.09618044, + "epoch": 0.4276644863409004, + "flos": 568367456256.0, + "grad_norm": 0.0619683298423062, + "language_loss": 0.85100698, + "learning_rate": 0.0006395613695736853, + "loss": 0.86216164, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.19287109, + "step": 2223, + "time_per_iteration": 2.701129674911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103811, + "balance_loss_mlp": 1.08472598, + "epoch": 0.4278568680261639, + "flos": 607436112384.0, + "grad_norm": 0.07797079059499014, + "language_loss": 0.81455553, + "learning_rate": 0.0006392621823060529, + "loss": 0.82559359, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.19067383, + "step": 2224, + "time_per_iteration": 2.7364578247070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099286, + "balance_loss_mlp": 1.08043897, + "epoch": 0.4280492497114275, + "flos": 560527589376.0, + "grad_norm": 0.08496205952123127, + "language_loss": 0.84790826, + "learning_rate": 0.0006389629409720465, + "loss": 0.85890114, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.18835449, + "step": 2225, + "time_per_iteration": 2.673173427581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109636, + "balance_loss_mlp": 1.07715571, + "epoch": 0.428241631396691, + "flos": 720646709760.0, + "grad_norm": 0.0715414323965843, + "language_loss": 0.88466454, + "learning_rate": 0.0006386636456878417, + "loss": 0.89562809, + 
"num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.19177246, + "step": 2226, + "time_per_iteration": 2.9119651317596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098271, + "balance_loss_mlp": 1.07898331, + "epoch": 0.4284340130819546, + "flos": 429467774976.0, + "grad_norm": 0.09078876082736503, + "language_loss": 0.91914666, + "learning_rate": 0.0006383642965696353, + "loss": 0.93012941, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.19262695, + "step": 2227, + "time_per_iteration": 2.546172618865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105185, + "balance_loss_mlp": 1.08565903, + "epoch": 0.42862639476721814, + "flos": 525016733184.0, + "grad_norm": 0.10289049243839221, + "language_loss": 0.83054781, + "learning_rate": 0.000638064893733645, + "loss": 0.84159964, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.19506836, + "step": 2228, + "time_per_iteration": 2.752192735671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110948, + "balance_loss_mlp": 1.09085989, + "epoch": 0.42881877645248173, + "flos": 465346446336.0, + "grad_norm": 0.15473525900744378, + "language_loss": 0.89614534, + "learning_rate": 0.000637765437296109, + "loss": 0.90724015, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.18615723, + "step": 2229, + "time_per_iteration": 2.6742892265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106608, + "balance_loss_mlp": 1.08742726, + "epoch": 0.42901115813774526, + "flos": 560297793024.0, + "grad_norm": 0.06911950421263405, + "language_loss": 0.8512131, + "learning_rate": 0.000637465927373287, + "loss": 0.86227918, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.19165039, + "step": 2230, + "time_per_iteration": 2.6567254066467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103693, + "balance_loss_mlp": 1.08500099, + "epoch": 0.42920353982300885, + "flos": 
561454115328.0, + "grad_norm": 0.08280955993669904, + "language_loss": 0.78714275, + "learning_rate": 0.000637166364081459, + "loss": 0.79817969, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.18688965, + "step": 2231, + "time_per_iteration": 2.671881914138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118758, + "balance_loss_mlp": 1.10104382, + "epoch": 0.42939592150827244, + "flos": 556248093696.0, + "grad_norm": 0.10217834412041502, + "language_loss": 0.84177876, + "learning_rate": 0.0006368667475369256, + "loss": 0.85296631, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.17736816, + "step": 2232, + "time_per_iteration": 2.760406732559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_mlp": 1.03175175, + "epoch": 0.42958830319353597, + "flos": 1521623688192.0, + "grad_norm": 0.029167273687310865, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79569829, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.10302734, + "step": 2233, + "time_per_iteration": 4.915542840957642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_mlp": 1.02887213, + "epoch": 0.42978068487879956, + "flos": 1495813837824.0, + "grad_norm": 0.028672121204767892, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79934502, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.10205078, + "step": 2234, + "time_per_iteration": 4.8368518352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158883, + "balance_loss_mlp": 1.14040589, + "epoch": 0.4299730665640631, + "flos": 546992372736.0, + "grad_norm": 0.1071521836349002, + "language_loss": 0.85815042, + "learning_rate": 0.0006359675795504112, + "loss": 0.86973917, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.18481445, + "step": 2235, + "time_per_iteration": 2.689207077026367 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157771, + "balance_loss_mlp": 1.13929391, + "epoch": 0.4301654482493267, + "flos": 1129293342720.0, + "grad_norm": 0.08968188926211089, + "language_loss": 0.74473494, + "learning_rate": 0.0006356677511584775, + "loss": 0.75631261, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.18481445, + "step": 2236, + "time_per_iteration": 3.4835057258605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140943, + "balance_loss_mlp": 1.12231028, + "epoch": 0.4303578299345902, + "flos": 495750025728.0, + "grad_norm": 0.07661214353194774, + "language_loss": 0.86188674, + "learning_rate": 0.0006353678700956511, + "loss": 0.8732962, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.18615723, + "step": 2237, + "time_per_iteration": 2.5932724475860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122051, + "balance_loss_mlp": 1.10363352, + "epoch": 0.4305502116198538, + "flos": 615762736128.0, + "grad_norm": 0.10135375141644645, + "language_loss": 0.83612645, + "learning_rate": 0.0006350679364783569, + "loss": 0.84734702, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.1842041, + "step": 2238, + "time_per_iteration": 2.799832582473755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116492, + "balance_loss_mlp": 1.09846783, + "epoch": 0.4307425933051173, + "flos": 559260039168.0, + "grad_norm": 0.08578747749075483, + "language_loss": 0.85542685, + "learning_rate": 0.0006347679504230393, + "loss": 0.86659181, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.18041992, + "step": 2239, + "time_per_iteration": 2.692394971847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101167, + "balance_loss_mlp": 1.08270121, + "epoch": 0.4309349749903809, + "flos": 972166344192.0, + "grad_norm": 0.07961944034188723, + "language_loss": 0.76030314, + "learning_rate": 0.0006344679120461632, + "loss": 
0.77131486, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.18444824, + "step": 2240, + "time_per_iteration": 3.3374927043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095492, + "balance_loss_mlp": 1.07701421, + "epoch": 0.4311273566756445, + "flos": 541924743168.0, + "grad_norm": 0.0793940534533153, + "language_loss": 0.7985338, + "learning_rate": 0.0006341678214642134, + "loss": 0.80948877, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.18469238, + "step": 2241, + "time_per_iteration": 2.6277148723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106602, + "balance_loss_mlp": 1.08830297, + "epoch": 0.43131973836090803, + "flos": 761674503168.0, + "grad_norm": 0.08042276557968771, + "language_loss": 0.82835627, + "learning_rate": 0.0006338676787936963, + "loss": 0.83942229, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.18286133, + "step": 2242, + "time_per_iteration": 3.1297900676727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108169, + "balance_loss_mlp": 1.08982253, + "epoch": 0.4315121200461716, + "flos": 554530862592.0, + "grad_norm": 0.09204417916973401, + "language_loss": 0.8383373, + "learning_rate": 0.0006335674841511367, + "loss": 0.84941894, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.18347168, + "step": 2243, + "time_per_iteration": 2.667814254760742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093207, + "balance_loss_mlp": 1.08414674, + "epoch": 0.43170450173143515, + "flos": 1485334609920.0, + "grad_norm": 0.03538748768114217, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80274379, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.09082031, + "step": 2244, + "time_per_iteration": 4.997291803359985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085049, + "balance_loss_mlp": 1.07603705, + "epoch": 0.43189688341669874, + 
"flos": 1473697234944.0, + "grad_norm": 0.03507908076143408, + "language_loss": 0.77365553, + "learning_rate": 0.0006329669394160953, + "loss": 0.78450596, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.09033203, + "step": 2245, + "time_per_iteration": 4.884565591812134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114789, + "balance_loss_mlp": 1.09558439, + "epoch": 0.43208926510196227, + "flos": 492938141184.0, + "grad_norm": 0.08187280769981854, + "language_loss": 0.82496786, + "learning_rate": 0.0006326665895567652, + "loss": 0.83611572, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.1920166, + "step": 2246, + "time_per_iteration": 2.6677396297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123468, + "balance_loss_mlp": 1.10469246, + "epoch": 0.43228164678722586, + "flos": 520235799552.0, + "grad_norm": 0.08598825839477024, + "language_loss": 0.86984897, + "learning_rate": 0.0006323661881916976, + "loss": 0.88108367, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.1875, + "step": 2247, + "time_per_iteration": 2.7388386726379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117292, + "balance_loss_mlp": 1.09908867, + "epoch": 0.4324740284724894, + "flos": 796056201216.0, + "grad_norm": 0.06738996012815959, + "language_loss": 0.80918467, + "learning_rate": 0.0006320657354375179, + "loss": 0.82035756, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.18212891, + "step": 2248, + "time_per_iteration": 3.047557830810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130282, + "balance_loss_mlp": 1.11192417, + "epoch": 0.432666410157753, + "flos": 482153140224.0, + "grad_norm": 0.08033421843515161, + "language_loss": 0.86710787, + "learning_rate": 0.0006317652314108726, + "loss": 0.8784107, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.18347168, + "step": 2249, + "time_per_iteration": 2.547611713409424 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121909, + "balance_loss_mlp": 1.10406351, + "epoch": 0.43285879184301657, + "flos": 500212329984.0, + "grad_norm": 0.07824522100123071, + "language_loss": 0.91323555, + "learning_rate": 0.0006314646762284277, + "loss": 0.92445469, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.17858887, + "step": 2250, + "time_per_iteration": 2.648721933364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024653, + "balance_loss_mlp": 1.01502049, + "epoch": 0.4330511735282801, + "flos": 1510448103936.0, + "grad_norm": 0.012196079218770799, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76450479, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.09619141, + "step": 2251, + "time_per_iteration": 4.9720799922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134181, + "balance_loss_mlp": 1.11596584, + "epoch": 0.4332435552135437, + "flos": 699582915072.0, + "grad_norm": 0.07706489930265227, + "language_loss": 0.77657586, + "learning_rate": 0.0006308634128629022, + "loss": 0.78791773, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.18225098, + "step": 2252, + "time_per_iteration": 2.898723602294922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131863, + "balance_loss_mlp": 1.11357653, + "epoch": 0.4334359368988072, + "flos": 592292321280.0, + "grad_norm": 0.09977200174188003, + "language_loss": 0.87270236, + "learning_rate": 0.0006305627049132531, + "loss": 0.88402092, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.18286133, + "step": 2253, + "time_per_iteration": 2.854081153869629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120951, + "balance_loss_mlp": 1.1019249, + "epoch": 0.4336283185840708, + "flos": 842806508544.0, + "grad_norm": 0.08155008814068082, + "language_loss": 0.8592571, + "learning_rate": 0.0006302619462746662, + "loss": 
0.87046659, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.19018555, + "step": 2254, + "time_per_iteration": 3.164759397506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111221, + "balance_loss_mlp": 1.09445965, + "epoch": 0.43382070026933434, + "flos": 626258843136.0, + "grad_norm": 0.0732322900076577, + "language_loss": 0.90031815, + "learning_rate": 0.0006299611370639069, + "loss": 0.91144025, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.17773438, + "step": 2255, + "time_per_iteration": 2.753937005996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111399, + "balance_loss_mlp": 1.09258795, + "epoch": 0.4340130819545979, + "flos": 591111406080.0, + "grad_norm": 0.07459277492074774, + "language_loss": 0.79176068, + "learning_rate": 0.0006296602773977593, + "loss": 0.80287468, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.18798828, + "step": 2256, + "time_per_iteration": 2.720043659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111381, + "balance_loss_mlp": 1.09282053, + "epoch": 0.4342054636398615, + "flos": 490889797632.0, + "grad_norm": 0.06314614385855079, + "language_loss": 0.873402, + "learning_rate": 0.0006293593673930277, + "loss": 0.88451576, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.18566895, + "step": 2257, + "time_per_iteration": 2.7014408111572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102122, + "balance_loss_mlp": 1.0837394, + "epoch": 0.43439784532512504, + "flos": 698994842112.0, + "grad_norm": 0.07573255135522808, + "language_loss": 0.78537059, + "learning_rate": 0.0006290584071665358, + "loss": 0.79639179, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.18371582, + "step": 2258, + "time_per_iteration": 2.9237425327301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109661, + "balance_loss_mlp": 1.09070623, + "epoch": 0.43459022701038863, + 
"flos": 485824739328.0, + "grad_norm": 0.09488327166679841, + "language_loss": 0.82044512, + "learning_rate": 0.0006287573968351266, + "loss": 0.83154172, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.18945312, + "step": 2259, + "time_per_iteration": 2.574779748916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100612, + "balance_loss_mlp": 1.08195579, + "epoch": 0.43478260869565216, + "flos": 643107382272.0, + "grad_norm": 0.08898100409874855, + "language_loss": 0.82007015, + "learning_rate": 0.0006284563365156626, + "loss": 0.83107626, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.18652344, + "step": 2260, + "time_per_iteration": 2.8346612453460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107563, + "balance_loss_mlp": 1.08845389, + "epoch": 0.43497499038091575, + "flos": 426097552896.0, + "grad_norm": 0.09100088182337301, + "language_loss": 0.87183499, + "learning_rate": 0.0006281552263250261, + "loss": 0.88291061, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.19116211, + "step": 2261, + "time_per_iteration": 2.549306631088257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054669, + "balance_loss_mlp": 1.04460812, + "epoch": 0.4351673720661793, + "flos": 1538378625024.0, + "grad_norm": 0.02508916228462863, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81746203, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.10058594, + "step": 2262, + "time_per_iteration": 4.837932348251343 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104657, + "balance_loss_mlp": 1.08554804, + "epoch": 0.43535975375144287, + "flos": 749155018752.0, + "grad_norm": 0.08522062407758652, + "language_loss": 0.81544203, + "learning_rate": 0.0006275528567978593, + "loss": 0.82648861, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.19091797, + "step": 2263, + "time_per_iteration": 
2.936924934387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112769, + "balance_loss_mlp": 1.09411263, + "epoch": 0.4355521354367064, + "flos": 861280874496.0, + "grad_norm": 0.07411268466258768, + "language_loss": 0.826931, + "learning_rate": 0.0006272515976951898, + "loss": 0.83805871, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.18640137, + "step": 2264, + "time_per_iteration": 3.0930423736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107636, + "balance_loss_mlp": 1.08872962, + "epoch": 0.43574451712197, + "flos": 734527719936.0, + "grad_norm": 0.09109036690828846, + "language_loss": 0.79239774, + "learning_rate": 0.0006269502891890687, + "loss": 0.80347407, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.18896484, + "step": 2265, + "time_per_iteration": 3.0183792114257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107502, + "balance_loss_mlp": 1.08883369, + "epoch": 0.4359368988072336, + "flos": 570578784768.0, + "grad_norm": 0.05550243860706018, + "language_loss": 0.87779111, + "learning_rate": 0.0006266489313964743, + "loss": 0.88886613, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.18652344, + "step": 2266, + "time_per_iteration": 2.7831835746765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121045, + "balance_loss_mlp": 1.10263872, + "epoch": 0.4361292804924971, + "flos": 555528969216.0, + "grad_norm": 0.0703513545387446, + "language_loss": 0.85298383, + "learning_rate": 0.0006263475244344041, + "loss": 0.86419421, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.18395996, + "step": 2267, + "time_per_iteration": 2.857132911682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118688, + "balance_loss_mlp": 1.10052013, + "epoch": 0.4363216621777607, + "flos": 557285847552.0, + "grad_norm": 0.08642791248778911, + "language_loss": 0.84379327, + "learning_rate": 
0.0006260460684198746, + "loss": 0.85498011, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 0.1817627, + "step": 2268, + "time_per_iteration": 2.692237138748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107605, + "balance_loss_mlp": 1.08955705, + "epoch": 0.4365140438630242, + "flos": 478222009344.0, + "grad_norm": 0.0923795472926113, + "language_loss": 0.84379983, + "learning_rate": 0.0006257445634699213, + "loss": 0.85487592, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.18066406, + "step": 2269, + "time_per_iteration": 2.5514066219329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113893, + "balance_loss_mlp": 1.0958451, + "epoch": 0.4367064255482878, + "flos": 578917891584.0, + "grad_norm": 0.07185982898842977, + "language_loss": 0.82919574, + "learning_rate": 0.0006254430097015993, + "loss": 0.84033465, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.18054199, + "step": 2270, + "time_per_iteration": 2.70414662361145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039786, + "balance_loss_mlp": 1.02981973, + "epoch": 0.43689880723355135, + "flos": 1458946225152.0, + "grad_norm": 0.018847560898896817, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.7751888, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.09960938, + "step": 2271, + "time_per_iteration": 4.881477355957031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109453, + "balance_loss_mlp": 1.09232235, + "epoch": 0.43709118891881493, + "flos": 667610408448.0, + "grad_norm": 0.06834440940873689, + "language_loss": 0.85169542, + "learning_rate": 0.0006248397561781609, + "loss": 0.86278993, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.17138672, + "step": 2272, + "time_per_iteration": 2.9807589054107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114956, + "balance_loss_mlp": 1.09752727, + 
"epoch": 0.43728357060407846, + "flos": 544872448512.0, + "grad_norm": 0.08779020279595867, + "language_loss": 0.85627788, + "learning_rate": 0.0006245380566572482, + "loss": 0.86742747, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.17456055, + "step": 2273, + "time_per_iteration": 2.6780998706817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113394, + "balance_loss_mlp": 1.09640646, + "epoch": 0.43747595228934205, + "flos": 746839802880.0, + "grad_norm": 0.07304773845504615, + "language_loss": 0.75851929, + "learning_rate": 0.0006242363087863744, + "loss": 0.7696532, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.17004395, + "step": 2274, + "time_per_iteration": 2.9744510650634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116168, + "balance_loss_mlp": 1.0989182, + "epoch": 0.43766833397460564, + "flos": 631353636864.0, + "grad_norm": 0.1377417309618575, + "language_loss": 0.86166036, + "learning_rate": 0.0006239345126826878, + "loss": 0.87282199, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.17272949, + "step": 2275, + "time_per_iteration": 2.7981135845184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108683, + "balance_loss_mlp": 1.09152877, + "epoch": 0.43786071565986917, + "flos": 530986295808.0, + "grad_norm": 0.07859590561046474, + "language_loss": 0.83992988, + "learning_rate": 0.0006236326684633561, + "loss": 0.8510167, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.17175293, + "step": 2276, + "time_per_iteration": 2.818861722946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112467, + "balance_loss_mlp": 1.09526503, + "epoch": 0.43805309734513276, + "flos": 538547180544.0, + "grad_norm": 0.07703424900820159, + "language_loss": 0.74875319, + "learning_rate": 0.0006233307762455658, + "loss": 0.75987786, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.17224121, + "step": 2277, + 
"time_per_iteration": 2.6329345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113593, + "balance_loss_mlp": 1.09641492, + "epoch": 0.4382454790303963, + "flos": 864542439936.0, + "grad_norm": 0.08103172587748399, + "language_loss": 0.83020627, + "learning_rate": 0.0006230288361465216, + "loss": 0.84134221, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.17199707, + "step": 2278, + "time_per_iteration": 3.093740701675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121877, + "balance_loss_mlp": 1.10465097, + "epoch": 0.4384378607156599, + "flos": 765499548672.0, + "grad_norm": 0.0865781646571655, + "language_loss": 0.8464967, + "learning_rate": 0.0006227268482834473, + "loss": 0.85771543, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.17248535, + "step": 2279, + "time_per_iteration": 2.9201176166534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125481, + "balance_loss_mlp": 1.10830259, + "epoch": 0.4386302424009234, + "flos": 668566669824.0, + "grad_norm": 0.07906200997295257, + "language_loss": 0.86881065, + "learning_rate": 0.000622424812773585, + "loss": 0.88006544, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.17199707, + "step": 2280, + "time_per_iteration": 2.8375024795532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133302, + "balance_loss_mlp": 1.11602879, + "epoch": 0.438822624086187, + "flos": 485182338048.0, + "grad_norm": 0.07902412331438459, + "language_loss": 0.79696977, + "learning_rate": 0.000622122729734195, + "loss": 0.80830276, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.17285156, + "step": 2281, + "time_per_iteration": 2.587625741958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127945, + "balance_loss_mlp": 1.11082637, + "epoch": 0.4390150057714506, + "flos": 499218992640.0, + "grad_norm": 0.06489318495758713, + "language_loss": 0.87247634, + "learning_rate": 
0.0006218205992825566, + "loss": 0.8837558, + "num_input_tokens_seen": 190243888, + "router_z_loss_mlp": 0.17138672, + "step": 2282, + "time_per_iteration": 2.6426842212677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132622, + "balance_loss_mlp": 1.11561131, + "epoch": 0.4392073874567141, + "flos": 558219714048.0, + "grad_norm": 0.07249325505401696, + "language_loss": 0.81692946, + "learning_rate": 0.0006215184215359671, + "loss": 0.82825571, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.17016602, + "step": 2283, + "time_per_iteration": 2.7548625469207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131603, + "balance_loss_mlp": 1.11440063, + "epoch": 0.4393997691419777, + "flos": 605306276352.0, + "grad_norm": 0.07525739768421633, + "language_loss": 0.86762762, + "learning_rate": 0.0006212161966117425, + "loss": 0.87894368, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.17224121, + "step": 2284, + "time_per_iteration": 2.738553762435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131945, + "balance_loss_mlp": 1.11446857, + "epoch": 0.43959215082724123, + "flos": 804145688064.0, + "grad_norm": 0.077553661572433, + "language_loss": 0.81615996, + "learning_rate": 0.0006209139246272164, + "loss": 0.82747942, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.17492676, + "step": 2285, + "time_per_iteration": 3.024388074874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133353, + "balance_loss_mlp": 1.11548376, + "epoch": 0.4397845325125048, + "flos": 487643286528.0, + "grad_norm": 0.07341525875363067, + "language_loss": 0.81566632, + "learning_rate": 0.0006206116056997421, + "loss": 0.8269999, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.17871094, + "step": 2286, + "time_per_iteration": 2.5751805305480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130534, + "balance_loss_mlp": 1.11304617, + 
"epoch": 0.43997691419776835, + "flos": 480811438080.0, + "grad_norm": 0.0674295682524957, + "language_loss": 0.82623774, + "learning_rate": 0.0006203092399466892, + "loss": 0.83754307, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.17504883, + "step": 2287, + "time_per_iteration": 2.557861566543579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142808, + "balance_loss_mlp": 1.12514091, + "epoch": 0.44016929588303194, + "flos": 483124082688.0, + "grad_norm": 0.055585597684010626, + "language_loss": 0.84940028, + "learning_rate": 0.0006200068274854473, + "loss": 0.8608284, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.17700195, + "step": 2288, + "time_per_iteration": 2.6604013442993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139053, + "balance_loss_mlp": 1.12110031, + "epoch": 0.4403616775682955, + "flos": 571853675520.0, + "grad_norm": 0.05756252195592342, + "language_loss": 0.85686207, + "learning_rate": 0.0006197043684334229, + "loss": 0.86825264, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.17956543, + "step": 2289, + "time_per_iteration": 2.7742552757263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136744, + "balance_loss_mlp": 1.11905324, + "epoch": 0.44055405925355906, + "flos": 630849627648.0, + "grad_norm": 0.09031384979596896, + "language_loss": 0.78885317, + "learning_rate": 0.0006194018629080411, + "loss": 0.80022061, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.17712402, + "step": 2290, + "time_per_iteration": 2.755141019821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143468, + "balance_loss_mlp": 1.12530041, + "epoch": 0.44074644093882265, + "flos": 536782961664.0, + "grad_norm": 0.10381992178140695, + "language_loss": 0.81444335, + "learning_rate": 0.0006190993110267451, + "loss": 0.82587808, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.18164062, + "step": 2291, + 
"time_per_iteration": 2.717245578765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138273, + "balance_loss_mlp": 1.1200701, + "epoch": 0.4409388226240862, + "flos": 463229093376.0, + "grad_norm": 0.06842071551306793, + "language_loss": 0.84298384, + "learning_rate": 0.0006187967129069958, + "loss": 0.8543666, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.18212891, + "step": 2292, + "time_per_iteration": 2.540931463241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139526, + "balance_loss_mlp": 1.12121558, + "epoch": 0.44113120430934977, + "flos": 566005252608.0, + "grad_norm": 0.07329037094919502, + "language_loss": 0.86953282, + "learning_rate": 0.0006184940686662722, + "loss": 0.88092804, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.18322754, + "step": 2293, + "time_per_iteration": 2.7757341861724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140768, + "balance_loss_mlp": 1.1223979, + "epoch": 0.4413235859946133, + "flos": 543585074688.0, + "grad_norm": 0.08855099948535183, + "language_loss": 0.89983863, + "learning_rate": 0.0006181913784220714, + "loss": 0.9112463, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.18371582, + "step": 2294, + "time_per_iteration": 2.723515510559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040412, + "balance_loss_mlp": 1.03092277, + "epoch": 0.4415159676798769, + "flos": 1569871342080.0, + "grad_norm": 0.030293744399198016, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81594193, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.09472656, + "step": 2295, + "time_per_iteration": 4.940914630889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125035, + "balance_loss_mlp": 1.10708177, + "epoch": 0.4417083493651404, + "flos": 658740128256.0, + "grad_norm": 0.07282895932349266, + "language_loss": 0.79783386, + 
"learning_rate": 0.0006175858603933146, + "loss": 0.80908418, + "num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.1796875, + "step": 2296, + "time_per_iteration": 2.9011893272399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117973, + "balance_loss_mlp": 1.09999609, + "epoch": 0.441900731050404, + "flos": 740457635328.0, + "grad_norm": 0.07093452663269637, + "language_loss": 0.80995864, + "learning_rate": 0.0006172830328438416, + "loss": 0.82113832, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.17993164, + "step": 2297, + "time_per_iteration": 2.984313726425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115845, + "balance_loss_mlp": 1.09765363, + "epoch": 0.44209311273566754, + "flos": 539441399808.0, + "grad_norm": 0.06543332431983825, + "language_loss": 0.87005913, + "learning_rate": 0.0006169801597610572, + "loss": 0.8812176, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.18212891, + "step": 2298, + "time_per_iteration": 2.7446672916412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105385, + "balance_loss_mlp": 1.08803988, + "epoch": 0.4422854944209311, + "flos": 621613730304.0, + "grad_norm": 0.09691889340683667, + "language_loss": 0.89723885, + "learning_rate": 0.0006166772412625469, + "loss": 0.90829265, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.17358398, + "step": 2299, + "time_per_iteration": 2.8357315063476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107801, + "balance_loss_mlp": 1.08962202, + "epoch": 0.4424778761061947, + "flos": 658824192000.0, + "grad_norm": 0.10216386732709903, + "language_loss": 0.81670028, + "learning_rate": 0.0006163742774659141, + "loss": 0.82777828, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.1817627, + "step": 2300, + "time_per_iteration": 2.886781692504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095311, + 
"balance_loss_mlp": 1.07751346, + "epoch": 0.44267025779145824, + "flos": 568577428992.0, + "grad_norm": 0.07973359147829089, + "language_loss": 0.85959738, + "learning_rate": 0.0006160712684887801, + "loss": 0.87055051, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.17822266, + "step": 2301, + "time_per_iteration": 2.7916574478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109187, + "balance_loss_mlp": 1.07431102, + "epoch": 0.44286263947672183, + "flos": 496738220544.0, + "grad_norm": 0.06808021774790461, + "language_loss": 0.82115805, + "learning_rate": 0.0006157682144487832, + "loss": 0.83207679, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.17565918, + "step": 2302, + "time_per_iteration": 2.795738458633423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094413, + "balance_loss_mlp": 1.07613826, + "epoch": 0.44305502116198536, + "flos": 609397820928.0, + "grad_norm": 0.0749153625811459, + "language_loss": 0.83107322, + "learning_rate": 0.0006154651154635793, + "loss": 0.84201735, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.18273926, + "step": 2303, + "time_per_iteration": 4.31014609336853 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090587, + "balance_loss_mlp": 1.07243156, + "epoch": 0.44324740284724895, + "flos": 470794747392.0, + "grad_norm": 0.07642073153592485, + "language_loss": 0.84451294, + "learning_rate": 0.0006151619716508421, + "loss": 0.8554188, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.18164062, + "step": 2304, + "time_per_iteration": 2.6006975173950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090647, + "balance_loss_mlp": 1.07205081, + "epoch": 0.4434397845325125, + "flos": 578725171200.0, + "grad_norm": 0.07612741560937177, + "language_loss": 0.87099224, + "learning_rate": 0.0006148587831282625, + "loss": 0.8818987, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 
0.18591309, + "step": 2305, + "time_per_iteration": 2.7009835243225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048095, + "balance_loss_mlp": 1.03808129, + "epoch": 0.44363216621777607, + "flos": 1496608939008.0, + "grad_norm": 0.019656861653556033, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80224162, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.10009766, + "step": 2306, + "time_per_iteration": 4.9429686069488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108493, + "balance_loss_mlp": 1.06604683, + "epoch": 0.44382454790303966, + "flos": 477322647552.0, + "grad_norm": 0.07723488854599227, + "language_loss": 0.87132251, + "learning_rate": 0.0006142522724244255, + "loss": 0.88217181, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.18884277, + "step": 2307, + "time_per_iteration": 2.553419828414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_mlp": 1.02589071, + "epoch": 0.4440169295883032, + "flos": 1544115820032.0, + "grad_norm": 0.014915460519873193, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77520525, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.09912109, + "step": 2308, + "time_per_iteration": 4.877639055252075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085887, + "balance_loss_mlp": 1.06711113, + "epoch": 0.4442093112735668, + "flos": 591089011200.0, + "grad_norm": 0.07688151387557987, + "language_loss": 0.77357888, + "learning_rate": 0.000613645584293942, + "loss": 0.78443772, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.18762207, + "step": 2309, + "time_per_iteration": 2.9022634029388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088358, + "balance_loss_mlp": 1.06968963, + "epoch": 0.4444016929588303, + "flos": 530272313856.0, + "grad_norm": 0.08682478727714991, + 
"language_loss": 0.83149701, + "learning_rate": 0.0006133421739881185, + "loss": 0.84238064, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.18664551, + "step": 2310, + "time_per_iteration": 2.6619491577148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090945, + "balance_loss_mlp": 1.07256329, + "epoch": 0.4445940746440939, + "flos": 620234952192.0, + "grad_norm": 0.08001840232131298, + "language_loss": 0.82499826, + "learning_rate": 0.0006130387196789605, + "loss": 0.8359077, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.18359375, + "step": 2311, + "time_per_iteration": 2.761312246322632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081759, + "balance_loss_mlp": 1.06348383, + "epoch": 0.4447864563293574, + "flos": 629100089856.0, + "grad_norm": 0.06942740185124545, + "language_loss": 0.84283984, + "learning_rate": 0.0006127352214842795, + "loss": 0.85365742, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.1829834, + "step": 2312, + "time_per_iteration": 2.9890031814575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083785, + "balance_loss_mlp": 1.06565332, + "epoch": 0.444978838014621, + "flos": 650838592512.0, + "grad_norm": 0.07063181629976649, + "language_loss": 0.85067087, + "learning_rate": 0.0006124316795219041, + "loss": 0.86150873, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.18139648, + "step": 2313, + "time_per_iteration": 2.7978243827819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_mlp": 1.06714296, + "epoch": 0.44517121969988455, + "flos": 612439501824.0, + "grad_norm": 0.08238507288636325, + "language_loss": 0.82411474, + "learning_rate": 0.0006121280939096794, + "loss": 0.83496892, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.1829834, + "step": 2314, + "time_per_iteration": 2.767470121383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01087652, + "balance_loss_mlp": 1.06994963, + "epoch": 0.44536360138514813, + "flos": 488735368704.0, + "grad_norm": 0.09711161856626577, + "language_loss": 0.87964773, + "learning_rate": 0.000611824464765468, + "loss": 0.89052415, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.17712402, + "step": 2315, + "time_per_iteration": 2.58632493019104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019027, + "balance_loss_mlp": 1.00934732, + "epoch": 0.4455559830704117, + "flos": 1516148969472.0, + "grad_norm": 0.012462298147770837, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79613966, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.09667969, + "step": 2316, + "time_per_iteration": 4.68027400970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097979, + "balance_loss_mlp": 1.08057404, + "epoch": 0.44574836475567525, + "flos": 615614432256.0, + "grad_norm": 0.09030294554601531, + "language_loss": 0.85568595, + "learning_rate": 0.000611217076352619, + "loss": 0.86666572, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.17419434, + "step": 2317, + "time_per_iteration": 2.8745946884155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096258, + "balance_loss_mlp": 1.07860303, + "epoch": 0.44594074644093884, + "flos": 506342306304.0, + "grad_norm": 0.06320933370201777, + "language_loss": 0.83313119, + "learning_rate": 0.0006109133173197905, + "loss": 0.84409374, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.17675781, + "step": 2318, + "time_per_iteration": 2.719902515411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104888, + "balance_loss_mlp": 1.08753085, + "epoch": 0.44613312812620237, + "flos": 726979318272.0, + "grad_norm": 0.07491768608262588, + "language_loss": 0.85073888, + "learning_rate": 0.0006106095152265935, + "loss": 0.86178774, + "num_input_tokens_seen": 193555952, + 
"router_z_loss_mlp": 0.17370605, + "step": 2319, + "time_per_iteration": 3.004857063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111563, + "balance_loss_mlp": 1.0939796, + "epoch": 0.44632550981146596, + "flos": 635746558464.0, + "grad_norm": 0.08385510801007982, + "language_loss": 0.84405756, + "learning_rate": 0.0006103056701909739, + "loss": 0.85517317, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.17602539, + "step": 2320, + "time_per_iteration": 2.966923475265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113274, + "balance_loss_mlp": 1.09577405, + "epoch": 0.4465178914967295, + "flos": 827074644480.0, + "grad_norm": 0.07685766834781843, + "language_loss": 0.8301264, + "learning_rate": 0.0006100017823308956, + "loss": 0.84125912, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.17504883, + "step": 2321, + "time_per_iteration": 3.204850196838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112805, + "balance_loss_mlp": 1.11025262, + "epoch": 0.4467102731819931, + "flos": 665831508480.0, + "grad_norm": 0.08670302679562208, + "language_loss": 0.79305983, + "learning_rate": 0.0006096978517643377, + "loss": 0.80434036, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.17797852, + "step": 2322, + "time_per_iteration": 2.860180139541626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112379, + "balance_loss_mlp": 1.10644507, + "epoch": 0.4469026548672566, + "flos": 512946929664.0, + "grad_norm": 0.12580563915967458, + "language_loss": 0.83188093, + "learning_rate": 0.0006093938786092968, + "loss": 0.84311885, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.17358398, + "step": 2323, + "time_per_iteration": 2.64030122756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124017, + "balance_loss_mlp": 1.10691094, + "epoch": 0.4470950365525202, + "flos": 684076078080.0, + "grad_norm": 
0.06761406024518349, + "language_loss": 0.89442849, + "learning_rate": 0.0006090898629837857, + "loss": 0.90566862, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.17126465, + "step": 2324, + "time_per_iteration": 2.8378353118896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137759, + "balance_loss_mlp": 1.1204021, + "epoch": 0.4472874182377838, + "flos": 627321189888.0, + "grad_norm": 0.1896235015526922, + "language_loss": 0.87233531, + "learning_rate": 0.0006087858050058337, + "loss": 0.88371289, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.17370605, + "step": 2325, + "time_per_iteration": 2.829404830932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131095, + "balance_loss_mlp": 1.1135118, + "epoch": 0.4474797999230473, + "flos": 547204916736.0, + "grad_norm": 0.07181125336629572, + "language_loss": 0.82417965, + "learning_rate": 0.0006084817047934866, + "loss": 0.83549058, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.17590332, + "step": 2326, + "time_per_iteration": 2.68251371383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134989, + "balance_loss_mlp": 1.11732209, + "epoch": 0.4476721816083109, + "flos": 455819083776.0, + "grad_norm": 0.08385131470703, + "language_loss": 0.89333081, + "learning_rate": 0.0006081775624648066, + "loss": 0.90468073, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.17675781, + "step": 2327, + "time_per_iteration": 2.533090591430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138131, + "balance_loss_mlp": 1.12101269, + "epoch": 0.44786456329357444, + "flos": 481518079488.0, + "grad_norm": 0.10743629798598615, + "language_loss": 0.82534277, + "learning_rate": 0.0006078733781378721, + "loss": 0.83672416, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.17138672, + "step": 2328, + "time_per_iteration": 2.597377061843872 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0111818, + "balance_loss_mlp": 1.10090625, + "epoch": 0.448056944978838, + "flos": 552104418816.0, + "grad_norm": 0.07758231479291984, + "language_loss": 0.82049984, + "learning_rate": 0.0006075691519307781, + "loss": 0.83168161, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.17297363, + "step": 2329, + "time_per_iteration": 2.8866052627563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110227, + "balance_loss_mlp": 1.09251261, + "epoch": 0.44824932666410156, + "flos": 550839439872.0, + "grad_norm": 0.0702768888062288, + "language_loss": 0.81606984, + "learning_rate": 0.0006072648839616356, + "loss": 0.82717204, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.17724609, + "step": 2330, + "time_per_iteration": 2.7015554904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114358, + "balance_loss_mlp": 1.09686995, + "epoch": 0.44844170834936514, + "flos": 988582454784.0, + "grad_norm": 0.07321658937944422, + "language_loss": 0.82347071, + "learning_rate": 0.0006069605743485718, + "loss": 0.83461428, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.1751709, + "step": 2331, + "time_per_iteration": 3.3698229789733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110303, + "balance_loss_mlp": 1.09319615, + "epoch": 0.44863409003462873, + "flos": 591321378816.0, + "grad_norm": 0.07314304322377065, + "language_loss": 0.83288682, + "learning_rate": 0.0006066562232097303, + "loss": 0.84398985, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.17126465, + "step": 2332, + "time_per_iteration": 2.7595224380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109542, + "balance_loss_mlp": 1.09135079, + "epoch": 0.44882647171989226, + "flos": 724646850048.0, + "grad_norm": 0.07260034454336384, + "language_loss": 0.86063141, + "learning_rate": 0.0006063518306632708, + "loss": 0.87172687, + "num_input_tokens_seen": 
194606800, + "router_z_loss_mlp": 0.18200684, + "step": 2333, + "time_per_iteration": 2.973802089691162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110869, + "balance_loss_mlp": 1.09335709, + "epoch": 0.44901885340515585, + "flos": 534927338496.0, + "grad_norm": 0.0724353146925312, + "language_loss": 0.82143402, + "learning_rate": 0.0006060473968273688, + "loss": 0.83254278, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.1751709, + "step": 2334, + "time_per_iteration": 2.716792583465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_mlp": 1.02476275, + "epoch": 0.4492112350904194, + "flos": 1555300942848.0, + "grad_norm": 0.01941960869972046, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.78913647, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.09326172, + "step": 2335, + "time_per_iteration": 4.891199827194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025292, + "balance_loss_mlp": 1.01608956, + "epoch": 0.44940361677568297, + "flos": 1523358171648.0, + "grad_norm": 0.01646335982957884, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82030511, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.09179688, + "step": 2336, + "time_per_iteration": 4.873430013656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112894, + "balance_loss_mlp": 1.09513164, + "epoch": 0.4495959984609465, + "flos": 382495011840.0, + "grad_norm": 0.18670212144629325, + "language_loss": 0.88409269, + "learning_rate": 0.0006051338487650047, + "loss": 0.89522159, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.17785645, + "step": 2337, + "time_per_iteration": 2.4702365398406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106326, + "balance_loss_mlp": 1.08833754, + "epoch": 0.4497883801462101, + "flos": 497879861760.0, + "grad_norm": 
0.08397051973497069, + "language_loss": 0.82701272, + "learning_rate": 0.0006048292509534095, + "loss": 0.83807594, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.17993164, + "step": 2338, + "time_per_iteration": 2.619450569152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110586, + "balance_loss_mlp": 1.08850312, + "epoch": 0.4499807618314736, + "flos": 614450769408.0, + "grad_norm": 0.20046859342765924, + "language_loss": 0.77607334, + "learning_rate": 0.0006045246124434895, + "loss": 0.78713191, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.17370605, + "step": 2339, + "time_per_iteration": 2.7321267127990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105214, + "balance_loss_mlp": 1.08761835, + "epoch": 0.4501731435167372, + "flos": 1005510288384.0, + "grad_norm": 0.08075651314496221, + "language_loss": 0.865839, + "learning_rate": 0.0006042199333535162, + "loss": 0.8768912, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.17614746, + "step": 2340, + "time_per_iteration": 3.306898832321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100999, + "balance_loss_mlp": 1.08355892, + "epoch": 0.4503655252020008, + "flos": 820880428032.0, + "grad_norm": 0.06823291393488413, + "language_loss": 0.83802176, + "learning_rate": 0.0006039152138017763, + "loss": 0.84903181, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.17443848, + "step": 2341, + "time_per_iteration": 3.1458027362823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104806, + "balance_loss_mlp": 1.08727062, + "epoch": 0.4505579068872643, + "flos": 486373165056.0, + "grad_norm": 0.08305826290941032, + "language_loss": 0.83554494, + "learning_rate": 0.0006036104539065726, + "loss": 0.84659296, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.17541504, + "step": 2342, + "time_per_iteration": 2.6648519039154053 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01102434, + "balance_loss_mlp": 1.08492208, + "epoch": 0.4507502885725279, + "flos": 884803046400.0, + "grad_norm": 0.06158872344302024, + "language_loss": 0.84248793, + "learning_rate": 0.000603305653786223, + "loss": 0.85351223, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.17529297, + "step": 2343, + "time_per_iteration": 3.176680326461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113287, + "balance_loss_mlp": 1.09581113, + "epoch": 0.45094267025779144, + "flos": 578339730432.0, + "grad_norm": 0.0747059506481359, + "language_loss": 0.84228522, + "learning_rate": 0.0006030008135590622, + "loss": 0.85341805, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.17480469, + "step": 2344, + "time_per_iteration": 2.742253065109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124309, + "balance_loss_mlp": 1.10722649, + "epoch": 0.45113505194305503, + "flos": 525387492864.0, + "grad_norm": 0.058134829204836994, + "language_loss": 0.799905, + "learning_rate": 0.0006026959333434387, + "loss": 0.81114811, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.17102051, + "step": 2345, + "time_per_iteration": 2.779311180114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132846, + "balance_loss_mlp": 1.11552477, + "epoch": 0.45132743362831856, + "flos": 502055470080.0, + "grad_norm": 0.07509063772314063, + "language_loss": 0.77367598, + "learning_rate": 0.0006023910132577181, + "loss": 0.78500438, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.17346191, + "step": 2346, + "time_per_iteration": 2.6779799461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113297, + "balance_loss_mlp": 1.11554205, + "epoch": 0.45151981531358215, + "flos": 431918811648.0, + "grad_norm": 0.10491289793116987, + "language_loss": 0.84559381, + "learning_rate": 0.0006020860534202806, + "loss": 0.85692352, + "num_input_tokens_seen": 
195930640, + "router_z_loss_mlp": 0.17443848, + "step": 2347, + "time_per_iteration": 2.528663158416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135795, + "balance_loss_mlp": 1.1183548, + "epoch": 0.4517121969988457, + "flos": 712159299072.0, + "grad_norm": 0.07098609761882418, + "language_loss": 0.80898821, + "learning_rate": 0.0006017810539495224, + "loss": 0.82034618, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.17468262, + "step": 2348, + "time_per_iteration": 2.9910202026367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111241, + "balance_loss_mlp": 1.09382474, + "epoch": 0.45190457868410927, + "flos": 579468888576.0, + "grad_norm": 0.07527105168067424, + "language_loss": 0.82186049, + "learning_rate": 0.0006014760149638547, + "loss": 0.83297288, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.17431641, + "step": 2349, + "time_per_iteration": 2.667600631713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124509, + "balance_loss_mlp": 1.10764134, + "epoch": 0.45209696036937286, + "flos": 482657149440.0, + "grad_norm": 0.07463444501983019, + "language_loss": 0.88244182, + "learning_rate": 0.000601170936581704, + "loss": 0.89368689, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.16870117, + "step": 2350, + "time_per_iteration": 2.5531952381134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124884, + "balance_loss_mlp": 1.10763478, + "epoch": 0.4522893420546364, + "flos": 540207512064.0, + "grad_norm": 0.07303827993658786, + "language_loss": 0.84088361, + "learning_rate": 0.0006008658189215121, + "loss": 0.85213244, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.17260742, + "step": 2351, + "time_per_iteration": 2.6667087078094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122786, + "balance_loss_mlp": 1.10538173, + "epoch": 0.4524817237399, + "flos": 496676551680.0, + "grad_norm": 
0.08019313993326724, + "language_loss": 0.80211049, + "learning_rate": 0.0006005606621017366, + "loss": 0.81333834, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.17419434, + "step": 2352, + "time_per_iteration": 2.5864298343658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110112, + "balance_loss_mlp": 1.09249294, + "epoch": 0.4526741054251635, + "flos": 652550681088.0, + "grad_norm": 0.08588176709504687, + "language_loss": 0.80108917, + "learning_rate": 0.0006002554662408496, + "loss": 0.81219029, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.1763916, + "step": 2353, + "time_per_iteration": 2.921902656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106203, + "balance_loss_mlp": 1.08839345, + "epoch": 0.4528664871104271, + "flos": 570939632640.0, + "grad_norm": 0.08839686088246723, + "language_loss": 0.91245115, + "learning_rate": 0.0005999502314573388, + "loss": 0.92351323, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.17822266, + "step": 2354, + "time_per_iteration": 2.6538503170013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098374, + "balance_loss_mlp": 1.08077872, + "epoch": 0.45305886879569063, + "flos": 458719801344.0, + "grad_norm": 0.07972814176434397, + "language_loss": 0.85777891, + "learning_rate": 0.0005996449578697066, + "loss": 0.86876267, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.17590332, + "step": 2355, + "time_per_iteration": 2.6249992847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112931, + "balance_loss_mlp": 1.09541893, + "epoch": 0.4532512504809542, + "flos": 505178643456.0, + "grad_norm": 0.0715197090101731, + "language_loss": 0.81223947, + "learning_rate": 0.0005993396455964709, + "loss": 0.82336879, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.17541504, + "step": 2356, + "time_per_iteration": 2.69350266456604 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01111792, + "balance_loss_mlp": 1.0944469, + "epoch": 0.4534436321662178, + "flos": 582213961728.0, + "grad_norm": 0.07234166204840274, + "language_loss": 0.81097758, + "learning_rate": 0.0005990342947561647, + "loss": 0.82209545, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.17358398, + "step": 2357, + "time_per_iteration": 2.7173328399658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123689, + "balance_loss_mlp": 1.10639215, + "epoch": 0.45363601385148133, + "flos": 549720193536.0, + "grad_norm": 0.09230022277941517, + "language_loss": 0.78124547, + "learning_rate": 0.0005987289054673351, + "loss": 0.79248238, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.1730957, + "step": 2358, + "time_per_iteration": 2.633007526397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108071, + "balance_loss_mlp": 1.09800935, + "epoch": 0.4538283955367449, + "flos": 1474559520768.0, + "grad_norm": 0.0537090739321762, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77683806, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.10058594, + "step": 2359, + "time_per_iteration": 4.852884769439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011235, + "balance_loss_mlp": 1.10622633, + "epoch": 0.45402077722200845, + "flos": 584711986176.0, + "grad_norm": 0.07905851512069884, + "language_loss": 0.91134411, + "learning_rate": 0.0005981180120183722, + "loss": 0.92257917, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.17285156, + "step": 2360, + "time_per_iteration": 2.7044413089752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119068, + "balance_loss_mlp": 1.10053074, + "epoch": 0.45421315890727204, + "flos": 531747265536.0, + "grad_norm": 0.05732939327341075, + "language_loss": 0.85087699, + "learning_rate": 0.0005978125080954089, + "loss": 0.8620677, + "num_input_tokens_seen": 
197094880, + "router_z_loss_mlp": 0.18530273, + "step": 2361, + "time_per_iteration": 2.775712251663208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105652, + "balance_loss_mlp": 1.08805668, + "epoch": 0.4544055405925356, + "flos": 785221641216.0, + "grad_norm": 0.0789619101325961, + "language_loss": 0.7727446, + "learning_rate": 0.000597506966198262, + "loss": 0.78380114, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.17614746, + "step": 2362, + "time_per_iteration": 2.974111557006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110357, + "balance_loss_mlp": 1.08590329, + "epoch": 0.45459792227779916, + "flos": 518199939072.0, + "grad_norm": 0.0858902108709268, + "language_loss": 0.83994937, + "learning_rate": 0.0005972013864455536, + "loss": 0.85098517, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.17675781, + "step": 2363, + "time_per_iteration": 2.6244583129882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101233, + "balance_loss_mlp": 1.08366108, + "epoch": 0.4547903039630627, + "flos": 537563755008.0, + "grad_norm": 0.08015454662625561, + "language_loss": 0.851372, + "learning_rate": 0.0005968957689559203, + "loss": 0.86238432, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.17602539, + "step": 2364, + "time_per_iteration": 2.6717097759246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098129, + "balance_loss_mlp": 1.08035493, + "epoch": 0.4549826856483263, + "flos": 528676222464.0, + "grad_norm": 0.07229553193462525, + "language_loss": 0.88592815, + "learning_rate": 0.0005965901138480131, + "loss": 0.89690942, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.17785645, + "step": 2365, + "time_per_iteration": 2.653158664703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098766, + "balance_loss_mlp": 1.08063412, + "epoch": 0.45517506733358987, + "flos": 520915276800.0, + "grad_norm": 
0.07319480450828385, + "language_loss": 0.87207007, + "learning_rate": 0.0005962844212404982, + "loss": 0.88305777, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.18151855, + "step": 2366, + "time_per_iteration": 2.727456569671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110146, + "balance_loss_mlp": 1.0928843, + "epoch": 0.4553674490188534, + "flos": 451052831232.0, + "grad_norm": 0.06525288256406295, + "language_loss": 0.87264466, + "learning_rate": 0.0005959786912520558, + "loss": 0.88374615, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.17285156, + "step": 2367, + "time_per_iteration": 2.6637766361236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107553, + "balance_loss_mlp": 1.08999324, + "epoch": 0.455559830704117, + "flos": 546594448896.0, + "grad_norm": 0.061777726879510934, + "language_loss": 0.8370434, + "learning_rate": 0.0005956729240013806, + "loss": 0.84811896, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.17565918, + "step": 2368, + "time_per_iteration": 2.815329074859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107603, + "balance_loss_mlp": 1.08967423, + "epoch": 0.4557522123893805, + "flos": 583765636608.0, + "grad_norm": 0.07604266440979088, + "language_loss": 0.91824389, + "learning_rate": 0.0005953671196071824, + "loss": 0.92931986, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.17944336, + "step": 2369, + "time_per_iteration": 2.711060047149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111288, + "balance_loss_mlp": 1.09501028, + "epoch": 0.4559445940746441, + "flos": 526415334912.0, + "grad_norm": 0.06552470471898014, + "language_loss": 0.80047917, + "learning_rate": 0.0005950612781881846, + "loss": 0.81160796, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.17871094, + "step": 2370, + "time_per_iteration": 2.710073709487915 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01108328, + "balance_loss_mlp": 1.09072089, + "epoch": 0.45613697575990764, + "flos": 652120823808.0, + "grad_norm": 0.1576706166146413, + "language_loss": 0.75711769, + "learning_rate": 0.0005947553998631259, + "loss": 0.76820099, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.1763916, + "step": 2371, + "time_per_iteration": 2.855384588241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098527, + "balance_loss_mlp": 1.08041906, + "epoch": 0.4563293574451712, + "flos": 867119385600.0, + "grad_norm": 0.056716395855559716, + "language_loss": 0.78911364, + "learning_rate": 0.000594449484750758, + "loss": 0.8000989, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.18127441, + "step": 2372, + "time_per_iteration": 4.694324493408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095337, + "balance_loss_mlp": 1.07693148, + "epoch": 0.45652173913043476, + "flos": 498079922688.0, + "grad_norm": 0.07402703052898342, + "language_loss": 0.82845718, + "learning_rate": 0.0005941435329698484, + "loss": 0.83941054, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.18395996, + "step": 2373, + "time_per_iteration": 2.677161693572998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094792, + "balance_loss_mlp": 1.07592094, + "epoch": 0.45671412081569834, + "flos": 560856130560.0, + "grad_norm": 0.07242003224557565, + "language_loss": 0.82777703, + "learning_rate": 0.0005938375446391778, + "loss": 0.83872497, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.18847656, + "step": 2374, + "time_per_iteration": 2.6986706256866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094981, + "balance_loss_mlp": 1.07626557, + "epoch": 0.45690650250096193, + "flos": 503122959360.0, + "grad_norm": 0.09602017850343586, + "language_loss": 0.88724887, + "learning_rate": 0.0005935315198775415, + "loss": 0.89819872, + "num_input_tokens_seen": 
198131232, + "router_z_loss_mlp": 0.18713379, + "step": 2375, + "time_per_iteration": 2.6160995960235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097379, + "balance_loss_mlp": 1.07811522, + "epoch": 0.45709888418622546, + "flos": 430698249216.0, + "grad_norm": 0.07644315743317759, + "language_loss": 0.86640108, + "learning_rate": 0.0005932254588037486, + "loss": 0.87737489, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.19262695, + "step": 2376, + "time_per_iteration": 2.5169382095336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097089, + "balance_loss_mlp": 1.07751513, + "epoch": 0.45729126587148905, + "flos": 525654365184.0, + "grad_norm": 0.07850584285058836, + "language_loss": 0.86183727, + "learning_rate": 0.000592919361536623, + "loss": 0.87280822, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.19580078, + "step": 2377, + "time_per_iteration": 2.668555498123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099166, + "balance_loss_mlp": 1.07996106, + "epoch": 0.4574836475567526, + "flos": 638002676736.0, + "grad_norm": 0.07491389260925961, + "language_loss": 0.89019889, + "learning_rate": 0.0005926132281950017, + "loss": 0.90119052, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.19213867, + "step": 2378, + "time_per_iteration": 2.7553632259368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098644, + "balance_loss_mlp": 1.07934439, + "epoch": 0.45767602924201617, + "flos": 649588294656.0, + "grad_norm": 0.07088499852096378, + "language_loss": 0.84996307, + "learning_rate": 0.0005923070588977367, + "loss": 0.86094952, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.19287109, + "step": 2379, + "time_per_iteration": 2.8268253803253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105163, + "balance_loss_mlp": 1.08666205, + "epoch": 0.4578684109272797, + "flos": 746676817920.0, + 
"grad_norm": 0.08663232567685626, + "language_loss": 0.85752875, + "learning_rate": 0.0005920008537636931, + "loss": 0.86858034, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.18493652, + "step": 2380, + "time_per_iteration": 2.9154610633850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111966, + "balance_loss_mlp": 1.09322584, + "epoch": 0.4580607926125433, + "flos": 641469072384.0, + "grad_norm": 0.06304298978525442, + "language_loss": 0.86810696, + "learning_rate": 0.0005916946129117504, + "loss": 0.87922657, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.18725586, + "step": 2381, + "time_per_iteration": 2.9332261085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116157, + "balance_loss_mlp": 1.09857368, + "epoch": 0.4582531742978069, + "flos": 801857636352.0, + "grad_norm": 0.07662767679861947, + "language_loss": 0.81012738, + "learning_rate": 0.0005913883364608017, + "loss": 0.821289, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.17602539, + "step": 2382, + "time_per_iteration": 3.0999624729156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122668, + "balance_loss_mlp": 1.104954, + "epoch": 0.4584455559830704, + "flos": 684295962624.0, + "grad_norm": 0.07647659587343762, + "language_loss": 0.88500929, + "learning_rate": 0.0005910820245297542, + "loss": 0.89623594, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.17724609, + "step": 2383, + "time_per_iteration": 2.8880879878997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124722, + "balance_loss_mlp": 1.10707903, + "epoch": 0.458637937668334, + "flos": 518177544192.0, + "grad_norm": 0.0900951330432027, + "language_loss": 0.80609989, + "learning_rate": 0.000590775677237529, + "loss": 0.81734717, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.17651367, + "step": 2384, + "time_per_iteration": 2.758249044418335 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133426, + "balance_loss_mlp": 1.11639071, + "epoch": 0.4588303193535975, + "flos": 505499844096.0, + "grad_norm": 0.08076424564900554, + "language_loss": 0.79984713, + "learning_rate": 0.0005904692947030601, + "loss": 0.81118137, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.17053223, + "step": 2385, + "time_per_iteration": 2.667224168777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129309, + "balance_loss_mlp": 1.11242914, + "epoch": 0.4590227010388611, + "flos": 495905670144.0, + "grad_norm": 0.10079326608985974, + "language_loss": 0.89998889, + "learning_rate": 0.0005901628770452963, + "loss": 0.91128194, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.16894531, + "step": 2386, + "time_per_iteration": 2.5951790809631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129955, + "balance_loss_mlp": 1.1131345, + "epoch": 0.45921508272412465, + "flos": 493620189696.0, + "grad_norm": 0.06835358350560915, + "language_loss": 0.87016714, + "learning_rate": 0.000589856424383199, + "loss": 0.88146669, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.16833496, + "step": 2387, + "time_per_iteration": 2.6031622886657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112576, + "balance_loss_mlp": 1.1086055, + "epoch": 0.45940746440938823, + "flos": 691394683392.0, + "grad_norm": 0.07768127603303249, + "language_loss": 0.82945853, + "learning_rate": 0.000589549936835744, + "loss": 0.84071612, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.17175293, + "step": 2388, + "time_per_iteration": 2.903437376022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112224, + "balance_loss_mlp": 1.10476351, + "epoch": 0.45959984609465176, + "flos": 503738196480.0, + "grad_norm": 0.06100287690428954, + "language_loss": 0.78894806, + "learning_rate": 0.0005892434145219202, + "loss": 0.80017042, + 
"num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.17504883, + "step": 2389, + "time_per_iteration": 2.61372709274292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104771, + "balance_loss_mlp": 1.08758104, + "epoch": 0.45979222777991535, + "flos": 676638904320.0, + "grad_norm": 0.07434011004541237, + "language_loss": 0.8214674, + "learning_rate": 0.0005889368575607303, + "loss": 0.83251518, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.17211914, + "step": 2390, + "time_per_iteration": 2.894376039505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113067, + "balance_loss_mlp": 1.09576964, + "epoch": 0.45998460946517894, + "flos": 777653415936.0, + "grad_norm": 0.08125857985315703, + "language_loss": 0.78747576, + "learning_rate": 0.00058863026607119, + "loss": 0.79860646, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.1730957, + "step": 2391, + "time_per_iteration": 3.112093210220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118062, + "balance_loss_mlp": 1.10093117, + "epoch": 0.46017699115044247, + "flos": 851461673472.0, + "grad_norm": 0.08788037013511367, + "language_loss": 0.7955699, + "learning_rate": 0.0005883236401723287, + "loss": 0.80675054, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.17150879, + "step": 2392, + "time_per_iteration": 3.242553472518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110798, + "balance_loss_mlp": 1.09295249, + "epoch": 0.46036937283570606, + "flos": 575878781952.0, + "grad_norm": 0.08816777762822899, + "language_loss": 0.84516722, + "learning_rate": 0.0005880169799831893, + "loss": 0.8562752, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.17858887, + "step": 2393, + "time_per_iteration": 2.6654422283172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098957, + "balance_loss_mlp": 1.08111119, + "epoch": 0.4605617545209696, + "flos": 
611866109952.0, + "grad_norm": 0.10997873970116459, + "language_loss": 0.81234348, + "learning_rate": 0.0005877102856228278, + "loss": 0.82333302, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.17858887, + "step": 2394, + "time_per_iteration": 2.873918294906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103019, + "balance_loss_mlp": 1.08542323, + "epoch": 0.4607541362062332, + "flos": 533138526720.0, + "grad_norm": 0.07484934817589016, + "language_loss": 0.84600067, + "learning_rate": 0.0005874035572103133, + "loss": 0.85703087, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.17602539, + "step": 2395, + "time_per_iteration": 2.6604511737823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106186, + "balance_loss_mlp": 1.08816206, + "epoch": 0.4609465178914967, + "flos": 647312726016.0, + "grad_norm": 0.09236346174205023, + "language_loss": 0.82285452, + "learning_rate": 0.0005870967948647288, + "loss": 0.83391643, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.18041992, + "step": 2396, + "time_per_iteration": 2.805236339569092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088357, + "balance_loss_mlp": 1.0784868, + "epoch": 0.4611388995767603, + "flos": 1466287225344.0, + "grad_norm": 0.0372592343397745, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75396657, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.09863281, + "step": 2397, + "time_per_iteration": 5.380864143371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114264, + "balance_loss_mlp": 1.09671664, + "epoch": 0.46133128126202383, + "flos": 723112427520.0, + "grad_norm": 0.08046670019017348, + "language_loss": 0.85787129, + "learning_rate": 0.0005864831688507443, + "loss": 0.86901391, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.17553711, + "step": 2398, + "time_per_iteration": 3.1147820949554443 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119108, + "balance_loss_mlp": 1.10053492, + "epoch": 0.4615236629472874, + "flos": 548010302976.0, + "grad_norm": 0.08636966322347801, + "language_loss": 0.75248241, + "learning_rate": 0.0005861763054205754, + "loss": 0.76367348, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.18566895, + "step": 2399, + "time_per_iteration": 2.787648916244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126018, + "balance_loss_mlp": 1.10773087, + "epoch": 0.461716044632551, + "flos": 602244771840.0, + "grad_norm": 0.07252969708721291, + "language_loss": 0.80419457, + "learning_rate": 0.0005858694085337976, + "loss": 0.81545472, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.18273926, + "step": 2400, + "time_per_iteration": 2.859846591949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113297, + "balance_loss_mlp": 1.09409237, + "epoch": 0.46190842631781454, + "flos": 474476258304.0, + "grad_norm": 0.08888433146403377, + "language_loss": 0.83730817, + "learning_rate": 0.0005855624783095589, + "loss": 0.84844118, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.19189453, + "step": 2401, + "time_per_iteration": 2.5447638034820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107409, + "balance_loss_mlp": 1.08806109, + "epoch": 0.4621008080030781, + "flos": 437483109888.0, + "grad_norm": 0.06969383703523749, + "language_loss": 0.85055763, + "learning_rate": 0.00058525551486702, + "loss": 0.86163163, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.19335938, + "step": 2402, + "time_per_iteration": 2.5561320781707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099434, + "balance_loss_mlp": 1.08090901, + "epoch": 0.46229318968834165, + "flos": 525461644800.0, + "grad_norm": 0.0974904106662223, + "language_loss": 0.80911911, + "learning_rate": 0.0005849485183253548, + "loss": 0.82011348, 
+ "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.18530273, + "step": 2403, + "time_per_iteration": 2.6459126472473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099055, + "balance_loss_mlp": 1.08017266, + "epoch": 0.46248557137360524, + "flos": 439622857728.0, + "grad_norm": 0.06563821415676413, + "language_loss": 0.87331611, + "learning_rate": 0.0005846414888037501, + "loss": 0.88430667, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.18896484, + "step": 2404, + "time_per_iteration": 2.5333003997802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091218, + "balance_loss_mlp": 1.07249045, + "epoch": 0.4626779530588688, + "flos": 617608447488.0, + "grad_norm": 0.06903002712252786, + "language_loss": 0.82273191, + "learning_rate": 0.0005843344264214049, + "loss": 0.83364403, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.18701172, + "step": 2405, + "time_per_iteration": 2.806748628616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103621, + "balance_loss_mlp": 1.08491707, + "epoch": 0.46287033474413236, + "flos": 670108432896.0, + "grad_norm": 0.07210099338506677, + "language_loss": 0.84715909, + "learning_rate": 0.0005840273312975317, + "loss": 0.8581953, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.18701172, + "step": 2406, + "time_per_iteration": 2.884800910949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113888, + "balance_loss_mlp": 1.09550619, + "epoch": 0.46306271642939595, + "flos": 480233276928.0, + "grad_norm": 0.08103405236073111, + "language_loss": 0.90235025, + "learning_rate": 0.0005837202035513555, + "loss": 0.9134891, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.18383789, + "step": 2407, + "time_per_iteration": 2.609774351119995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114389, + "balance_loss_mlp": 1.09645963, + "epoch": 0.4632550981146595, + "flos": 
580686879744.0, + "grad_norm": 0.08825825577707168, + "language_loss": 0.81317043, + "learning_rate": 0.0005834130433021136, + "loss": 0.8243143, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.17932129, + "step": 2408, + "time_per_iteration": 2.775449514389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109877, + "balance_loss_mlp": 1.09179258, + "epoch": 0.46344747979992307, + "flos": 523964298240.0, + "grad_norm": 0.07528135433799624, + "language_loss": 0.73480821, + "learning_rate": 0.0005831058506690563, + "loss": 0.74590695, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.18078613, + "step": 2409, + "time_per_iteration": 2.675328254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104926, + "balance_loss_mlp": 1.08739018, + "epoch": 0.4636398614851866, + "flos": 746501349888.0, + "grad_norm": 0.06500990989470928, + "language_loss": 0.85772568, + "learning_rate": 0.0005827986257714464, + "loss": 0.86877489, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.17541504, + "step": 2410, + "time_per_iteration": 2.934680461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106367, + "balance_loss_mlp": 1.0885216, + "epoch": 0.4638322431704502, + "flos": 596547224064.0, + "grad_norm": 0.1078033090301908, + "language_loss": 0.88550043, + "learning_rate": 0.0005824913687285591, + "loss": 0.89656413, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.17858887, + "step": 2411, + "time_per_iteration": 2.74306058883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101516, + "balance_loss_mlp": 1.08387256, + "epoch": 0.4640246248557137, + "flos": 539443971072.0, + "grad_norm": 0.08594294380237487, + "language_loss": 0.81337988, + "learning_rate": 0.0005821840796596821, + "loss": 0.82439506, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.17663574, + "step": 2412, + "time_per_iteration": 2.7274651527404785 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105841, + "balance_loss_mlp": 1.08832955, + "epoch": 0.4642170065409773, + "flos": 562625118720.0, + "grad_norm": 0.05827694326073197, + "language_loss": 0.80418169, + "learning_rate": 0.0005818767586841158, + "loss": 0.81524014, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.1751709, + "step": 2413, + "time_per_iteration": 2.779078722000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109963, + "balance_loss_mlp": 1.09252286, + "epoch": 0.46440938822624084, + "flos": 530959131648.0, + "grad_norm": 0.06834094492641501, + "language_loss": 0.86072665, + "learning_rate": 0.0005815694059211726, + "loss": 0.87182629, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.17456055, + "step": 2414, + "time_per_iteration": 2.7060773372650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022324, + "balance_loss_mlp": 1.01297832, + "epoch": 0.4646017699115044, + "flos": 1526325700608.0, + "grad_norm": 0.02599871836797638, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.81895959, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.09326172, + "step": 2415, + "time_per_iteration": 4.83809757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018248, + "balance_loss_mlp": 1.00894976, + "epoch": 0.464794151596768, + "flos": 1540831859712.0, + "grad_norm": 0.022144294594628845, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.7796331, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.09277344, + "step": 2416, + "time_per_iteration": 4.993790626525879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135903, + "balance_loss_mlp": 1.11812854, + "epoch": 0.46498653328203154, + "flos": 501467397120.0, + "grad_norm": 0.10260058932365836, + "language_loss": 0.862611, + "learning_rate": 0.0005806471581013931, + "loss": 0.87397003, + 
"num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.17785645, + "step": 2417, + "time_per_iteration": 2.689473867416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142112, + "balance_loss_mlp": 1.12353921, + "epoch": 0.46517891496729513, + "flos": 676144806912.0, + "grad_norm": 0.08959237751331865, + "language_loss": 0.78271216, + "learning_rate": 0.0005803396793823146, + "loss": 0.79413325, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.18579102, + "step": 2418, + "time_per_iteration": 2.8183717727661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126782, + "balance_loss_mlp": 1.10836434, + "epoch": 0.46537129665255866, + "flos": 585351816192.0, + "grad_norm": 0.10270562971795844, + "language_loss": 0.85666251, + "learning_rate": 0.0005800321694726065, + "loss": 0.86793029, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.1842041, + "step": 2419, + "time_per_iteration": 2.797482490539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116623, + "balance_loss_mlp": 1.09855139, + "epoch": 0.46556367833782225, + "flos": 587704108032.0, + "grad_norm": 0.0731094360896604, + "language_loss": 0.86679709, + "learning_rate": 0.0005797246284916545, + "loss": 0.8779633, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.1809082, + "step": 2420, + "time_per_iteration": 2.707942008972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054766, + "balance_loss_mlp": 1.04570651, + "epoch": 0.4657560600230858, + "flos": 1485453551616.0, + "grad_norm": 0.038938158808133214, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78559953, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.09082031, + "step": 2421, + "time_per_iteration": 4.987195253372192 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094388, + "balance_loss_mlp": 1.07617295, + "epoch": 0.46594844170834937, + "flos": 
580247110656.0, + "grad_norm": 0.09940681141683862, + "language_loss": 0.87739611, + "learning_rate": 0.0005791094537936233, + "loss": 0.88833994, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.18237305, + "step": 2422, + "time_per_iteration": 2.7631046772003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091469, + "balance_loss_mlp": 1.07345629, + "epoch": 0.4661408233936129, + "flos": 512571400704.0, + "grad_norm": 0.06779180589479097, + "language_loss": 0.8166219, + "learning_rate": 0.0005788018203153762, + "loss": 0.82753664, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.18017578, + "step": 2423, + "time_per_iteration": 2.6615488529205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085208, + "balance_loss_mlp": 1.06742215, + "epoch": 0.4663332050788765, + "flos": 491077748736.0, + "grad_norm": 0.08426811135055082, + "language_loss": 0.85527384, + "learning_rate": 0.000578494156243549, + "loss": 0.86612594, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.17810059, + "step": 2424, + "time_per_iteration": 2.6183924674987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089098, + "balance_loss_mlp": 1.07045364, + "epoch": 0.4665255867641401, + "flos": 512623157760.0, + "grad_norm": 0.08457394710823794, + "language_loss": 0.89275956, + "learning_rate": 0.0005781864616975878, + "loss": 0.90365046, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.18640137, + "step": 2425, + "time_per_iteration": 2.6595993041992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096807, + "balance_loss_mlp": 1.07906842, + "epoch": 0.4667179684494036, + "flos": 424812750336.0, + "grad_norm": 0.0955155738973633, + "language_loss": 0.84080482, + "learning_rate": 0.0005778787367969502, + "loss": 0.8517729, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.17749023, + "step": 2426, + "time_per_iteration": 2.573312759399414 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010984, + "balance_loss_mlp": 1.08017302, + "epoch": 0.4669103501346672, + "flos": 707956526592.0, + "grad_norm": 0.07224995984565184, + "language_loss": 0.81008911, + "learning_rate": 0.0005775709816611053, + "loss": 0.82107311, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.18237305, + "step": 2427, + "time_per_iteration": 2.9737117290496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096193, + "balance_loss_mlp": 1.07804918, + "epoch": 0.4671027318199307, + "flos": 554832239616.0, + "grad_norm": 0.0630888064205099, + "language_loss": 0.83649611, + "learning_rate": 0.0005772631964095346, + "loss": 0.84745806, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.18151855, + "step": 2428, + "time_per_iteration": 2.7121798992156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108204, + "balance_loss_mlp": 1.09003639, + "epoch": 0.4672951135051943, + "flos": 567109817856.0, + "grad_norm": 0.07098479359046088, + "language_loss": 0.85673976, + "learning_rate": 0.000576955381161731, + "loss": 0.86782181, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.18164062, + "step": 2429, + "time_per_iteration": 2.7059943675994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102277, + "balance_loss_mlp": 1.08414483, + "epoch": 0.46748749519045785, + "flos": 424518713856.0, + "grad_norm": 0.07900180679196234, + "language_loss": 0.86017609, + "learning_rate": 0.0005766475360371985, + "loss": 0.87119883, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.18115234, + "step": 2430, + "time_per_iteration": 2.5818653106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106318, + "balance_loss_mlp": 1.08826935, + "epoch": 0.46767987687572143, + "flos": 538344548352.0, + "grad_norm": 0.07907770586360956, + "language_loss": 0.8455205, + "learning_rate": 0.0005763396611554536, + "loss": 
0.85658371, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.18066406, + "step": 2431, + "time_per_iteration": 2.6773664951324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109521, + "balance_loss_mlp": 1.09193754, + "epoch": 0.467872258560985, + "flos": 823702224384.0, + "grad_norm": 0.09111480047327246, + "language_loss": 0.79973984, + "learning_rate": 0.0005760317566360237, + "loss": 0.81083506, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.17602539, + "step": 2432, + "time_per_iteration": 3.014580726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114014, + "balance_loss_mlp": 1.09622765, + "epoch": 0.46806464024624855, + "flos": 661663240704.0, + "grad_norm": 0.0789075933194326, + "language_loss": 0.85020924, + "learning_rate": 0.000575723822598448, + "loss": 0.86134946, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.17785645, + "step": 2433, + "time_per_iteration": 2.8005478382110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111562, + "balance_loss_mlp": 1.09765542, + "epoch": 0.46825702193151214, + "flos": 755700171264.0, + "grad_norm": 0.07367233066443238, + "language_loss": 0.8147794, + "learning_rate": 0.0005754158591622773, + "loss": 0.82593554, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.1796875, + "step": 2434, + "time_per_iteration": 3.0118775367736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011158, + "balance_loss_mlp": 1.09752536, + "epoch": 0.4684494036167757, + "flos": 439393061376.0, + "grad_norm": 0.07922373152064655, + "language_loss": 0.82327235, + "learning_rate": 0.0005751078664470732, + "loss": 0.83443034, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.18286133, + "step": 2435, + "time_per_iteration": 2.5390684604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116733, + "balance_loss_mlp": 1.09935236, + "epoch": 0.46864178530203926, + 
"flos": 532706098176.0, + "grad_norm": 0.07859313369065737, + "language_loss": 0.85868919, + "learning_rate": 0.0005747998445724094, + "loss": 0.86985648, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.17382812, + "step": 2436, + "time_per_iteration": 2.6606297492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112946, + "balance_loss_mlp": 1.11235368, + "epoch": 0.4688341669873028, + "flos": 576627268608.0, + "grad_norm": 0.10622400322266522, + "language_loss": 0.8919673, + "learning_rate": 0.0005744917936578707, + "loss": 0.90326178, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.17126465, + "step": 2437, + "time_per_iteration": 2.8204565048217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121886, + "balance_loss_mlp": 1.10436273, + "epoch": 0.4690265486725664, + "flos": 539579791872.0, + "grad_norm": 0.06508472909978535, + "language_loss": 0.8377744, + "learning_rate": 0.0005741837138230526, + "loss": 0.8489933, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.17553711, + "step": 2438, + "time_per_iteration": 2.781350612640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122059, + "balance_loss_mlp": 1.10464203, + "epoch": 0.4692189303578299, + "flos": 770510278656.0, + "grad_norm": 0.06834159619761165, + "language_loss": 0.86276829, + "learning_rate": 0.0005738756051875627, + "loss": 0.87398893, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.17431641, + "step": 2439, + "time_per_iteration": 3.121708631515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131074, + "balance_loss_mlp": 1.11383653, + "epoch": 0.4694113120430935, + "flos": 571396654080.0, + "grad_norm": 0.07303953933220877, + "language_loss": 0.82923281, + "learning_rate": 0.0005735674678710192, + "loss": 0.84054363, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.17260742, + "step": 2440, + "time_per_iteration": 2.749302864074707 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122683, + "balance_loss_mlp": 1.1051836, + "epoch": 0.4696036937283571, + "flos": 748816565760.0, + "grad_norm": 0.1547549936477752, + "language_loss": 0.80928504, + "learning_rate": 0.0005732593019930517, + "loss": 0.82051194, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.17504883, + "step": 2441, + "time_per_iteration": 2.9091122150421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137766, + "balance_loss_mlp": 1.12098181, + "epoch": 0.4697960754136206, + "flos": 493454633472.0, + "grad_norm": 0.0743256165664551, + "language_loss": 0.87914228, + "learning_rate": 0.0005729511076733008, + "loss": 0.89051992, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.16796875, + "step": 2442, + "time_per_iteration": 2.728706121444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140336, + "balance_loss_mlp": 1.12288404, + "epoch": 0.4699884570988842, + "flos": 725118925824.0, + "grad_norm": 0.07419109808583535, + "language_loss": 0.84796697, + "learning_rate": 0.000572642885031418, + "loss": 0.85937035, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.17456055, + "step": 2443, + "time_per_iteration": 2.8746440410614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134354, + "balance_loss_mlp": 1.11715245, + "epoch": 0.47018083878414774, + "flos": 555427653120.0, + "grad_norm": 0.10756822588652355, + "language_loss": 0.80578518, + "learning_rate": 0.0005723346341870662, + "loss": 0.81712866, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.17224121, + "step": 2444, + "time_per_iteration": 2.740504741668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114197, + "balance_loss_mlp": 1.12406492, + "epoch": 0.4703732204694113, + "flos": 424069032960.0, + "grad_norm": 0.12204296392179416, + "language_loss": 0.86163437, + "learning_rate": 0.0005720263552599188, + "loss": 
0.87305409, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.17907715, + "step": 2445, + "time_per_iteration": 2.489807367324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112646, + "balance_loss_mlp": 1.10888886, + "epoch": 0.47056560215467486, + "flos": 703494222336.0, + "grad_norm": 0.08439630255123334, + "language_loss": 0.79720879, + "learning_rate": 0.0005717180483696604, + "loss": 0.80847341, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.17590332, + "step": 2446, + "time_per_iteration": 2.9626049995422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113574, + "balance_loss_mlp": 1.09573984, + "epoch": 0.47075798383993844, + "flos": 554963291136.0, + "grad_norm": 0.0764291785045912, + "language_loss": 0.83012414, + "learning_rate": 0.0005714097136359862, + "loss": 0.84125984, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.17822266, + "step": 2447, + "time_per_iteration": 2.6736068725585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105721, + "balance_loss_mlp": 1.08789945, + "epoch": 0.470950365525202, + "flos": 564305273856.0, + "grad_norm": 0.08513203657143086, + "language_loss": 0.86345923, + "learning_rate": 0.0005711013511786027, + "loss": 0.87451649, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.1784668, + "step": 2448, + "time_per_iteration": 2.7899038791656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096309, + "balance_loss_mlp": 1.07914329, + "epoch": 0.47114274721046556, + "flos": 534450493440.0, + "grad_norm": 0.06769719009727464, + "language_loss": 0.83320636, + "learning_rate": 0.0005707929611172263, + "loss": 0.8441695, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.171875, + "step": 2449, + "time_per_iteration": 2.7302591800689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094917, + "balance_loss_mlp": 1.07738137, + "epoch": 0.47133512889572915, + 
"flos": 473117303808.0, + "grad_norm": 0.0952592580133139, + "language_loss": 0.83792615, + "learning_rate": 0.000570484543571585, + "loss": 0.84887528, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.17553711, + "step": 2450, + "time_per_iteration": 2.553699254989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091615, + "balance_loss_mlp": 1.07405567, + "epoch": 0.4715275105809927, + "flos": 459013837824.0, + "grad_norm": 0.09253179962645706, + "language_loss": 0.82604945, + "learning_rate": 0.0005701760986614171, + "loss": 0.83696556, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.17578125, + "step": 2451, + "time_per_iteration": 2.5708320140838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084195, + "balance_loss_mlp": 1.06648016, + "epoch": 0.47171989226625627, + "flos": 422003437056.0, + "grad_norm": 0.09280751659958478, + "language_loss": 0.87434494, + "learning_rate": 0.0005698676265064714, + "loss": 0.88518691, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.17736816, + "step": 2452, + "time_per_iteration": 2.505521297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.06540704, + "epoch": 0.4719122739515198, + "flos": 457434998784.0, + "grad_norm": 0.08307061480415358, + "language_loss": 0.88798922, + "learning_rate": 0.0005695591272265074, + "loss": 0.89882344, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.18017578, + "step": 2453, + "time_per_iteration": 2.5634660720825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091392, + "balance_loss_mlp": 1.07360613, + "epoch": 0.4721046556367834, + "flos": 514975449600.0, + "grad_norm": 0.09129518334944925, + "language_loss": 0.81819969, + "learning_rate": 0.0005692506009412954, + "loss": 0.8291136, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.17797852, + "step": 2454, + "time_per_iteration": 2.740715980529785 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094643, + "balance_loss_mlp": 1.08458209, + "epoch": 0.4722970373220469, + "flos": 1572258138624.0, + "grad_norm": 0.045004720534391626, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78645909, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.10058594, + "step": 2455, + "time_per_iteration": 4.978295803070068 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110787, + "balance_loss_mlp": 1.08972645, + "epoch": 0.4724894190073105, + "flos": 586214102016.0, + "grad_norm": 0.07943806135548723, + "language_loss": 0.89481127, + "learning_rate": 0.0005686334678342593, + "loss": 0.90588999, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.18151855, + "step": 2456, + "time_per_iteration": 2.9444401264190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124124, + "balance_loss_mlp": 1.10643291, + "epoch": 0.4726818006925741, + "flos": 867645789696.0, + "grad_norm": 0.08486852653668125, + "language_loss": 0.81272578, + "learning_rate": 0.0005683248612520274, + "loss": 0.8239671, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.17700195, + "step": 2457, + "time_per_iteration": 3.1061813831329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113744, + "balance_loss_mlp": 1.11931992, + "epoch": 0.4728741823778376, + "flos": 752967581184.0, + "grad_norm": 0.11516736159890015, + "language_loss": 0.83477956, + "learning_rate": 0.0005680162281437321, + "loss": 0.84615391, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.18115234, + "step": 2458, + "time_per_iteration": 2.929063558578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148235, + "balance_loss_mlp": 1.13042545, + "epoch": 0.4730665640631012, + "flos": 538571773440.0, + "grad_norm": 0.07751254840004482, + "language_loss": 0.84309924, + "learning_rate": 0.000567707568629195, + "loss": 
0.85458159, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 0.17810059, + "step": 2459, + "time_per_iteration": 2.7221994400024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147022, + "balance_loss_mlp": 1.12910485, + "epoch": 0.47325894574836475, + "flos": 491653338624.0, + "grad_norm": 0.08725044616859287, + "language_loss": 0.81842762, + "learning_rate": 0.0005673988828282486, + "loss": 0.82989782, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.17932129, + "step": 2460, + "time_per_iteration": 2.7002882957458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137284, + "balance_loss_mlp": 1.11850882, + "epoch": 0.47345132743362833, + "flos": 764459223552.0, + "grad_norm": 0.08215342810100013, + "language_loss": 0.80515504, + "learning_rate": 0.0005670901708607352, + "loss": 0.8165279, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.1875, + "step": 2461, + "time_per_iteration": 2.9950685501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118533, + "balance_loss_mlp": 1.09990108, + "epoch": 0.47364370911889186, + "flos": 540173007360.0, + "grad_norm": 0.10884730986404606, + "language_loss": 0.83628744, + "learning_rate": 0.0005667814328465076, + "loss": 0.84747279, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.1862793, + "step": 2462, + "time_per_iteration": 2.645465612411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108626, + "balance_loss_mlp": 1.09035087, + "epoch": 0.47383609080415545, + "flos": 406219815936.0, + "grad_norm": 0.09091581525952792, + "language_loss": 0.81654978, + "learning_rate": 0.0005664726689054285, + "loss": 0.82763606, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.18261719, + "step": 2463, + "time_per_iteration": 2.4545066356658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104041, + "balance_loss_mlp": 1.08579004, + "epoch": 0.474028472489419, + 
"flos": 453476703744.0, + "grad_norm": 0.07864824239143242, + "language_loss": 0.80990708, + "learning_rate": 0.0005661638791573704, + "loss": 0.82094747, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.18237305, + "step": 2464, + "time_per_iteration": 2.734745502471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108445, + "balance_loss_mlp": 1.08969331, + "epoch": 0.47422085417468257, + "flos": 492177171456.0, + "grad_norm": 0.0786760499807007, + "language_loss": 0.86728454, + "learning_rate": 0.0005658550637222164, + "loss": 0.87836903, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.1875, + "step": 2465, + "time_per_iteration": 2.6243197917938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098109, + "balance_loss_mlp": 1.07942867, + "epoch": 0.47441323585994616, + "flos": 738854203392.0, + "grad_norm": 0.07656108123336647, + "language_loss": 0.82025492, + "learning_rate": 0.0005655462227198592, + "loss": 0.831236, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.18676758, + "step": 2466, + "time_per_iteration": 2.9340949058532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090686, + "balance_loss_mlp": 1.0713619, + "epoch": 0.4746056175452097, + "flos": 484685669376.0, + "grad_norm": 0.08929128939464244, + "language_loss": 0.84165299, + "learning_rate": 0.0005652373562702016, + "loss": 0.8525598, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.19311523, + "step": 2467, + "time_per_iteration": 2.6669704914093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088642, + "balance_loss_mlp": 1.07042646, + "epoch": 0.4747979992304733, + "flos": 461052269568.0, + "grad_norm": 0.09740211929478898, + "language_loss": 0.88243479, + "learning_rate": 0.000564928464493156, + "loss": 0.89332116, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.18212891, + "step": 2468, + "time_per_iteration": 2.5501999855041504 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083596, + "balance_loss_mlp": 1.06571448, + "epoch": 0.4749903809157368, + "flos": 864431212032.0, + "grad_norm": 0.10206964777214489, + "language_loss": 0.8130033, + "learning_rate": 0.000564619547508645, + "loss": 0.82383919, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.17907715, + "step": 2469, + "time_per_iteration": 3.1110846996307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080615, + "balance_loss_mlp": 1.0618155, + "epoch": 0.4751827626010004, + "flos": 505546831872.0, + "grad_norm": 0.10847559686300064, + "language_loss": 0.83074248, + "learning_rate": 0.0005643106054366008, + "loss": 0.84154862, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.18798828, + "step": 2470, + "time_per_iteration": 2.5955324172973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082834, + "balance_loss_mlp": 1.0653584, + "epoch": 0.47537514428626393, + "flos": 559388519424.0, + "grad_norm": 0.07776069310312227, + "language_loss": 0.78943384, + "learning_rate": 0.000564001638396965, + "loss": 0.80026221, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.17492676, + "step": 2471, + "time_per_iteration": 2.7306296825408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090328, + "balance_loss_mlp": 1.07253027, + "epoch": 0.4755675259715275, + "flos": 834260000256.0, + "grad_norm": 0.0797482134953605, + "language_loss": 0.81547666, + "learning_rate": 0.0005636926465096897, + "loss": 0.8263799, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.17810059, + "step": 2472, + "time_per_iteration": 3.059279203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112, + "balance_loss_mlp": 1.09371316, + "epoch": 0.47575990765679105, + "flos": 508237576704.0, + "grad_norm": 0.08460495515925144, + "language_loss": 0.87285447, + "learning_rate": 0.0005633836298947363, + "loss": 0.88397449, 
+ "num_input_tokens_seen": 206008352, + "router_z_loss_mlp": 0.18286133, + "step": 2473, + "time_per_iteration": 2.6521553993225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122658, + "balance_loss_mlp": 1.10413289, + "epoch": 0.47595228934205464, + "flos": 591845211648.0, + "grad_norm": 0.09203669339342216, + "language_loss": 0.70590854, + "learning_rate": 0.000563074588672075, + "loss": 0.71713507, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.18530273, + "step": 2474, + "time_per_iteration": 2.7375221252441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113642, + "balance_loss_mlp": 1.11839581, + "epoch": 0.4761446710273182, + "flos": 580607958528.0, + "grad_norm": 0.0857314817059495, + "language_loss": 0.8500272, + "learning_rate": 0.0005627655229616868, + "loss": 0.86139143, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.18029785, + "step": 2475, + "time_per_iteration": 2.7078299522399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128625, + "balance_loss_mlp": 1.11030293, + "epoch": 0.47633705271258175, + "flos": 672893153280.0, + "grad_norm": 0.07963853645873449, + "language_loss": 0.89927155, + "learning_rate": 0.0005624564328835616, + "loss": 0.91055775, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.18334961, + "step": 2476, + "time_per_iteration": 2.8388264179229736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117766, + "balance_loss_mlp": 1.09916914, + "epoch": 0.47652943439784534, + "flos": 541857931776.0, + "grad_norm": 0.07471116365669703, + "language_loss": 0.83945388, + "learning_rate": 0.0005621473185576986, + "loss": 0.85063154, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.18579102, + "step": 2477, + "time_per_iteration": 2.7755634784698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112627, + "balance_loss_mlp": 1.09451878, + "epoch": 0.4767218160831089, + "flos": 
524819243520.0, + "grad_norm": 0.10765434361010802, + "language_loss": 0.87517297, + "learning_rate": 0.0005618381801041068, + "loss": 0.88629925, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.18115234, + "step": 2478, + "time_per_iteration": 2.6078171730041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110924, + "balance_loss_mlp": 1.0912751, + "epoch": 0.47691419776837246, + "flos": 568056167424.0, + "grad_norm": 0.09054531696498577, + "language_loss": 0.8286736, + "learning_rate": 0.0005615290176428044, + "loss": 0.83976603, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.17980957, + "step": 2479, + "time_per_iteration": 2.658313035964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093873, + "balance_loss_mlp": 1.07611132, + "epoch": 0.477106579453636, + "flos": 530931967488.0, + "grad_norm": 0.07218164617984826, + "language_loss": 0.85039639, + "learning_rate": 0.0005612198312938187, + "loss": 0.8613351, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.17773438, + "step": 2480, + "time_per_iteration": 2.7423031330108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_mlp": 1.07839966, + "epoch": 0.4772989611388996, + "flos": 594283765248.0, + "grad_norm": 0.08183869789897112, + "language_loss": 0.79371572, + "learning_rate": 0.0005609106211771868, + "loss": 0.80467397, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.17443848, + "step": 2481, + "time_per_iteration": 2.888284921646118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098997, + "balance_loss_mlp": 1.08134174, + "epoch": 0.4774913428241631, + "flos": 544622828544.0, + "grad_norm": 0.07799032438633784, + "language_loss": 0.89138782, + "learning_rate": 0.0005606013874129543, + "loss": 0.90237772, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.17675781, + "step": 2482, + "time_per_iteration": 2.8308520317077637 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096101, + "balance_loss_mlp": 1.07892263, + "epoch": 0.4776837245094267, + "flos": 540079031808.0, + "grad_norm": 0.06912495328146803, + "language_loss": 0.79914749, + "learning_rate": 0.0005602921301211768, + "loss": 0.81010854, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.17199707, + "step": 2483, + "time_per_iteration": 2.745229721069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092062, + "balance_loss_mlp": 1.07441866, + "epoch": 0.4778761061946903, + "flos": 471785513472.0, + "grad_norm": 0.08947954354315603, + "language_loss": 0.8218801, + "learning_rate": 0.0005599828494219185, + "loss": 0.83280063, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.1763916, + "step": 2484, + "time_per_iteration": 2.5549302101135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096763, + "balance_loss_mlp": 1.07945359, + "epoch": 0.4780684878799538, + "flos": 726082527744.0, + "grad_norm": 0.09532235552733567, + "language_loss": 0.8879438, + "learning_rate": 0.0005596735454352527, + "loss": 0.89891142, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.17333984, + "step": 2485, + "time_per_iteration": 2.8665127754211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094881, + "balance_loss_mlp": 1.07777441, + "epoch": 0.4782608695652174, + "flos": 548922147840.0, + "grad_norm": 0.09434748219243295, + "language_loss": 0.85316986, + "learning_rate": 0.0005593642182812619, + "loss": 0.8641187, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.17126465, + "step": 2486, + "time_per_iteration": 2.6778790950775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094993, + "balance_loss_mlp": 1.07798147, + "epoch": 0.47845325125048094, + "flos": 829923604992.0, + "grad_norm": 0.07207308279854807, + "language_loss": 0.83091319, + "learning_rate": 0.0005590548680800378, + "loss": 0.84186316, + 
"num_input_tokens_seen": 207039792, + "router_z_loss_mlp": 0.17028809, + "step": 2487, + "time_per_iteration": 3.121678590774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100078, + "balance_loss_mlp": 1.08330488, + "epoch": 0.4786456329357445, + "flos": 514164920832.0, + "grad_norm": 0.0688175569320757, + "language_loss": 0.76333058, + "learning_rate": 0.0005587454949516804, + "loss": 0.77433127, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.16784668, + "step": 2488, + "time_per_iteration": 2.7487144470214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109664, + "balance_loss_mlp": 1.09223557, + "epoch": 0.47883801462100806, + "flos": 564658781184.0, + "grad_norm": 0.0791895688664035, + "language_loss": 0.87661278, + "learning_rate": 0.0005584360990162993, + "loss": 0.88770944, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.17443848, + "step": 2489, + "time_per_iteration": 2.6889615058898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.08878708, + "epoch": 0.47903039630627164, + "flos": 579577545216.0, + "grad_norm": 0.06381910852284944, + "language_loss": 0.85160542, + "learning_rate": 0.0005581266803940124, + "loss": 0.8626619, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.16870117, + "step": 2490, + "time_per_iteration": 2.752704381942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108064, + "balance_loss_mlp": 1.09077895, + "epoch": 0.47922277799153523, + "flos": 618950149632.0, + "grad_norm": 0.06997425176776657, + "language_loss": 0.87046134, + "learning_rate": 0.0005578172392049471, + "loss": 0.88154197, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.17297363, + "step": 2491, + "time_per_iteration": 2.744326114654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113808, + "balance_loss_mlp": 1.09704673, + "epoch": 0.47941515967679876, + "flos": 
639653096448.0, + "grad_norm": 0.0919919864780235, + "language_loss": 0.84245729, + "learning_rate": 0.0005575077755692386, + "loss": 0.85359544, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.16760254, + "step": 2492, + "time_per_iteration": 2.829349994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106621, + "balance_loss_mlp": 1.08978891, + "epoch": 0.47960754136206235, + "flos": 519823194624.0, + "grad_norm": 0.07193820952165939, + "language_loss": 0.85866803, + "learning_rate": 0.0005571982896070316, + "loss": 0.86973423, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.16845703, + "step": 2493, + "time_per_iteration": 2.6917920112609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111679, + "balance_loss_mlp": 1.09457207, + "epoch": 0.4797999230473259, + "flos": 475044507648.0, + "grad_norm": 0.08033850408937983, + "language_loss": 0.89604986, + "learning_rate": 0.0005568887814384792, + "loss": 0.9071666, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.17114258, + "step": 2494, + "time_per_iteration": 2.569196939468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106396, + "balance_loss_mlp": 1.08963561, + "epoch": 0.47999230473258947, + "flos": 532026620928.0, + "grad_norm": 0.07662616215624289, + "language_loss": 0.87274265, + "learning_rate": 0.000556579251183743, + "loss": 0.88380659, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.16772461, + "step": 2495, + "time_per_iteration": 4.119016408920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.09276271, + "epoch": 0.480184686417853, + "flos": 601486373376.0, + "grad_norm": 0.07795098880988466, + "language_loss": 0.79870969, + "learning_rate": 0.0005562696989629936, + "loss": 0.80980641, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.16918945, + "step": 2496, + "time_per_iteration": 2.780027151107788 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112455, + "balance_loss_mlp": 1.09557533, + "epoch": 0.4803770681031166, + "flos": 528196806144.0, + "grad_norm": 0.068284016634177, + "language_loss": 0.82789242, + "learning_rate": 0.0005559601248964095, + "loss": 0.83901697, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.16894531, + "step": 2497, + "time_per_iteration": 2.653590202331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110865, + "balance_loss_mlp": 1.09190154, + "epoch": 0.4805694497883801, + "flos": 511192622592.0, + "grad_norm": 0.10697304585744172, + "language_loss": 0.85506153, + "learning_rate": 0.0005556505291041783, + "loss": 0.86614799, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.16760254, + "step": 2498, + "time_per_iteration": 2.720294952392578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106549, + "balance_loss_mlp": 1.08972836, + "epoch": 0.4807618314736437, + "flos": 600342160896.0, + "grad_norm": 0.0621998173583794, + "language_loss": 0.84237647, + "learning_rate": 0.0005553409117064954, + "loss": 0.85344195, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.16833496, + "step": 2499, + "time_per_iteration": 2.9154043197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119945, + "balance_loss_mlp": 1.10298109, + "epoch": 0.4809542131589073, + "flos": 568965441024.0, + "grad_norm": 0.07282479458874046, + "language_loss": 0.84656966, + "learning_rate": 0.0005550312728235654, + "loss": 0.85776907, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.16967773, + "step": 2500, + "time_per_iteration": 2.700421094894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110812, + "balance_loss_mlp": 1.09159744, + "epoch": 0.4811465948441708, + "flos": 575994779136.0, + "grad_norm": 0.08404220746537734, + "language_loss": 0.83821297, + "learning_rate": 0.0005547216125756003, + "loss": 0.84929419, + 
"num_input_tokens_seen": 208048000, + "router_z_loss_mlp": 0.1652832, + "step": 2501, + "time_per_iteration": 2.7834067344665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106276, + "balance_loss_mlp": 1.08955085, + "epoch": 0.4813389765294344, + "flos": 823865209344.0, + "grad_norm": 0.07639679647694927, + "language_loss": 0.81906044, + "learning_rate": 0.0005544119310828211, + "loss": 0.83012319, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.16723633, + "step": 2502, + "time_per_iteration": 3.116422414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_mlp": 1.09020913, + "epoch": 0.48153135821469795, + "flos": 635531816448.0, + "grad_norm": 0.07431223188319182, + "language_loss": 0.84573793, + "learning_rate": 0.0005541022284654568, + "loss": 0.85680836, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.16845703, + "step": 2503, + "time_per_iteration": 2.9265871047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110287, + "balance_loss_mlp": 1.08615696, + "epoch": 0.48172373989996153, + "flos": 503701120512.0, + "grad_norm": 0.06355297884535237, + "language_loss": 0.83910048, + "learning_rate": 0.0005537925048437446, + "loss": 0.85012925, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.16723633, + "step": 2504, + "time_per_iteration": 2.6517508029937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087379, + "balance_loss_mlp": 1.07774711, + "epoch": 0.48191612158522507, + "flos": 1532362074624.0, + "grad_norm": 0.041815183909307344, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76838851, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.09619141, + "step": 2505, + "time_per_iteration": 4.958322048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105183, + "balance_loss_mlp": 1.08836293, + "epoch": 0.48210850327048865, + "flos": 
702424161792.0, + "grad_norm": 0.060666396845578126, + "language_loss": 0.88195586, + "learning_rate": 0.0005531729950682664, + "loss": 0.8930077, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.16833496, + "step": 2506, + "time_per_iteration": 3.0288734436035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103671, + "balance_loss_mlp": 1.08631384, + "epoch": 0.4823008849557522, + "flos": 439778502144.0, + "grad_norm": 0.10090208417938805, + "language_loss": 0.84562349, + "learning_rate": 0.000552863209155015, + "loss": 0.85666019, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.17382812, + "step": 2507, + "time_per_iteration": 2.503030300140381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104399, + "balance_loss_mlp": 1.0873642, + "epoch": 0.48249326664101577, + "flos": 471859665408.0, + "grad_norm": 0.0644343170841742, + "language_loss": 0.82010555, + "learning_rate": 0.0005525534027184461, + "loss": 0.83114958, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.17053223, + "step": 2508, + "time_per_iteration": 2.563375949859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115093, + "balance_loss_mlp": 1.09834397, + "epoch": 0.48268564832627936, + "flos": 563225674752.0, + "grad_norm": 0.20306769309253048, + "language_loss": 0.82742786, + "learning_rate": 0.0005522435758788365, + "loss": 0.83857882, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.16760254, + "step": 2509, + "time_per_iteration": 2.773317813873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107185, + "balance_loss_mlp": 1.08974481, + "epoch": 0.4828780300115429, + "flos": 629606670336.0, + "grad_norm": 0.08084829795782655, + "language_loss": 0.80297685, + "learning_rate": 0.0005519337287564721, + "loss": 0.81404877, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.17468262, + "step": 2510, + "time_per_iteration": 2.8417367935180664 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109893, + "balance_loss_mlp": 1.09273911, + "epoch": 0.4830704116968065, + "flos": 631850305536.0, + "grad_norm": 0.07005467856459312, + "language_loss": 0.83318454, + "learning_rate": 0.000551623861471646, + "loss": 0.84428346, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.17175293, + "step": 2511, + "time_per_iteration": 4.144210577011108 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031589, + "balance_loss_mlp": 1.02186131, + "epoch": 0.48326279338207, + "flos": 1569268588032.0, + "grad_norm": 0.022823457387693702, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79850423, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.09716797, + "step": 2512, + "time_per_iteration": 4.846112489700317 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105484, + "balance_loss_mlp": 1.08805561, + "epoch": 0.4834551750673336, + "flos": 509238254592.0, + "grad_norm": 0.06582055063949785, + "language_loss": 0.86307418, + "learning_rate": 0.0005510040668958211, + "loss": 0.87412906, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.17443848, + "step": 2513, + "time_per_iteration": 2.5893678665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027944, + "balance_loss_mlp": 1.01802599, + "epoch": 0.48364755675259713, + "flos": 1528663311360.0, + "grad_norm": 0.018178820637651416, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78788525, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.09912109, + "step": 2514, + "time_per_iteration": 4.883544445037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104231, + "balance_loss_mlp": 1.08638501, + "epoch": 0.4838399384378607, + "flos": 564989893632.0, + "grad_norm": 0.07451301520475437, + "language_loss": 0.83174801, + "learning_rate": 0.0005503841931138645, + "loss": 
0.84279031, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.17858887, + "step": 2515, + "time_per_iteration": 2.6821184158325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099055, + "balance_loss_mlp": 1.0817579, + "epoch": 0.4840323201231243, + "flos": 387691121664.0, + "grad_norm": 0.1026377711865236, + "language_loss": 0.81650221, + "learning_rate": 0.0005500742268214025, + "loss": 0.82749277, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.17321777, + "step": 2516, + "time_per_iteration": 2.501392364501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094696, + "balance_loss_mlp": 1.07677877, + "epoch": 0.48422470180838784, + "flos": 630995360256.0, + "grad_norm": 0.06104395933883966, + "language_loss": 0.85527956, + "learning_rate": 0.0005497642410884014, + "loss": 0.86622655, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.17919922, + "step": 2517, + "time_per_iteration": 2.7879879474639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092849, + "balance_loss_mlp": 1.07494426, + "epoch": 0.4844170834936514, + "flos": 499226333184.0, + "grad_norm": 0.0763804859448823, + "language_loss": 0.85418707, + "learning_rate": 0.0005494542360352085, + "loss": 0.86511558, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.17919922, + "step": 2518, + "time_per_iteration": 2.705934762954712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098599, + "balance_loss_mlp": 1.0811708, + "epoch": 0.48460946517891496, + "flos": 551076576768.0, + "grad_norm": 0.07348525281964927, + "language_loss": 0.855097, + "learning_rate": 0.0005491442117821783, + "loss": 0.86608291, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.17456055, + "step": 2519, + "time_per_iteration": 2.7056097984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097149, + "balance_loss_mlp": 1.07910061, + "epoch": 0.48480184686417854, + 
"flos": 529390204416.0, + "grad_norm": 0.07963371062569355, + "language_loss": 0.87741303, + "learning_rate": 0.0005488341684496732, + "loss": 0.88838446, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.18054199, + "step": 2520, + "time_per_iteration": 2.6991913318634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107269, + "balance_loss_mlp": 1.08979297, + "epoch": 0.4849942285494421, + "flos": 531912821760.0, + "grad_norm": 0.06522694836378315, + "language_loss": 0.91749704, + "learning_rate": 0.0005485241061580624, + "loss": 0.92856967, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.17480469, + "step": 2521, + "time_per_iteration": 2.751336097717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111149, + "balance_loss_mlp": 1.09335089, + "epoch": 0.48518661023470566, + "flos": 722578682880.0, + "grad_norm": 0.0788581364531382, + "language_loss": 0.84810591, + "learning_rate": 0.0005482140250277228, + "loss": 0.85921741, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.17797852, + "step": 2522, + "time_per_iteration": 3.012603759765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116154, + "balance_loss_mlp": 1.09896421, + "epoch": 0.4853789919199692, + "flos": 506105169408.0, + "grad_norm": 0.081531881919659, + "language_loss": 0.87781787, + "learning_rate": 0.0005479039251790387, + "loss": 0.88897943, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.17211914, + "step": 2523, + "time_per_iteration": 2.6643292903900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115817, + "balance_loss_mlp": 1.0985198, + "epoch": 0.4855713736052328, + "flos": 660487094784.0, + "grad_norm": 0.1008566510750689, + "language_loss": 0.84847081, + "learning_rate": 0.0005475938067324014, + "loss": 0.85962898, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.1730957, + "step": 2524, + "time_per_iteration": 2.8631820678710938 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129536, + "balance_loss_mlp": 1.11252499, + "epoch": 0.48576375529049637, + "flos": 436959277056.0, + "grad_norm": 0.08592622698203999, + "language_loss": 0.83456719, + "learning_rate": 0.0005472836698082098, + "loss": 0.84586251, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.17028809, + "step": 2525, + "time_per_iteration": 2.5364460945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109588, + "balance_loss_mlp": 1.09244525, + "epoch": 0.4859561369757599, + "flos": 581707381248.0, + "grad_norm": 0.06952957834620052, + "language_loss": 0.8412683, + "learning_rate": 0.0005469735145268694, + "loss": 0.85236418, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.17138672, + "step": 2526, + "time_per_iteration": 2.766571283340454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106022, + "balance_loss_mlp": 1.08884394, + "epoch": 0.4861485186610235, + "flos": 487964487168.0, + "grad_norm": 0.07975413334667165, + "language_loss": 0.80809188, + "learning_rate": 0.0005466633410087933, + "loss": 0.81915212, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.171875, + "step": 2527, + "time_per_iteration": 2.738344192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072106, + "balance_loss_mlp": 1.06094766, + "epoch": 0.486340900346287, + "flos": 1557734727168.0, + "grad_norm": 0.03644390169401177, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78332925, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.11181641, + "step": 2528, + "time_per_iteration": 4.871282339096069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090172, + "balance_loss_mlp": 1.07268429, + "epoch": 0.4865332820315506, + "flos": 483005514240.0, + "grad_norm": 0.06987485087243678, + "language_loss": 0.87962806, + "learning_rate": 0.0005460429397441214, + "loss": 
0.89052981, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.1751709, + "step": 2529, + "time_per_iteration": 2.589794635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097683, + "balance_loss_mlp": 1.08112478, + "epoch": 0.48672566371681414, + "flos": 535809447936.0, + "grad_norm": 0.08125917870845005, + "language_loss": 0.86507833, + "learning_rate": 0.0005457327122383866, + "loss": 0.87605512, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.16564941, + "step": 2530, + "time_per_iteration": 2.633769989013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024086, + "balance_loss_mlp": 1.01402473, + "epoch": 0.4869180454020777, + "flos": 1412665422336.0, + "grad_norm": 0.019350247330642424, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75660574, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.10058594, + "step": 2531, + "time_per_iteration": 4.829160213470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111265, + "balance_loss_mlp": 1.09450376, + "epoch": 0.48711042708734126, + "flos": 573113885184.0, + "grad_norm": 0.07679109022151961, + "language_loss": 0.7589134, + "learning_rate": 0.0005451122040823244, + "loss": 0.77002603, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.16760254, + "step": 2532, + "time_per_iteration": 2.809295654296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113381, + "balance_loss_mlp": 1.09582114, + "epoch": 0.48730280877260485, + "flos": 626547737088.0, + "grad_norm": 0.07652021477742418, + "language_loss": 0.76977062, + "learning_rate": 0.0005448019236728997, + "loss": 0.78090441, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.17565918, + "step": 2533, + "time_per_iteration": 2.889730930328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111676, + "balance_loss_mlp": 1.09540379, + "epoch": 0.48749519045786843, 
+ "flos": 512479996416.0, + "grad_norm": 0.08912362185496442, + "language_loss": 0.84908152, + "learning_rate": 0.0005444916258698255, + "loss": 0.86019826, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.16271973, + "step": 2534, + "time_per_iteration": 2.6680796146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109562, + "balance_loss_mlp": 1.09297991, + "epoch": 0.48768757214313196, + "flos": 525414657024.0, + "grad_norm": 0.06587099405348051, + "language_loss": 0.85898745, + "learning_rate": 0.0005441813107935704, + "loss": 0.87008309, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.16589355, + "step": 2535, + "time_per_iteration": 2.708963394165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121617, + "balance_loss_mlp": 1.10494018, + "epoch": 0.48787995382839555, + "flos": 505032910848.0, + "grad_norm": 0.07506618076199813, + "language_loss": 0.856264, + "learning_rate": 0.0005438709785646091, + "loss": 0.86748016, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.16687012, + "step": 2536, + "time_per_iteration": 2.5794246196746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111352, + "balance_loss_mlp": 1.0970813, + "epoch": 0.4880723355136591, + "flos": 575172140544.0, + "grad_norm": 0.06872348733444625, + "language_loss": 0.86540043, + "learning_rate": 0.0005435606293034234, + "loss": 0.87653565, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.16442871, + "step": 2537, + "time_per_iteration": 2.663050889968872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116159, + "balance_loss_mlp": 1.0999465, + "epoch": 0.48826471719892267, + "flos": 561444203520.0, + "grad_norm": 0.09164692396838796, + "language_loss": 0.84696114, + "learning_rate": 0.0005432502631305016, + "loss": 0.85812277, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.16210938, + "step": 2538, + "time_per_iteration": 
2.7034809589385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119353, + "balance_loss_mlp": 1.10295033, + "epoch": 0.4884570988841862, + "flos": 726188613120.0, + "grad_norm": 0.06227186407680876, + "language_loss": 0.82968855, + "learning_rate": 0.0005429398801663386, + "loss": 0.84088206, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.1640625, + "step": 2539, + "time_per_iteration": 3.0155930519104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120209, + "balance_loss_mlp": 1.10398471, + "epoch": 0.4886494805694498, + "flos": 431019449856.0, + "grad_norm": 0.10714048411465311, + "language_loss": 0.82757926, + "learning_rate": 0.0005426294805314355, + "loss": 0.83878136, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.16223145, + "step": 2540, + "time_per_iteration": 2.5441384315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115337, + "balance_loss_mlp": 1.09914827, + "epoch": 0.4888418622547134, + "flos": 673006579200.0, + "grad_norm": 0.08648554978838247, + "language_loss": 0.79954243, + "learning_rate": 0.0005423190643463003, + "loss": 0.81069577, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.16186523, + "step": 2541, + "time_per_iteration": 2.992694854736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112211, + "balance_loss_mlp": 1.0954504, + "epoch": 0.4890342439399769, + "flos": 541897579008.0, + "grad_norm": 0.08541624697499144, + "language_loss": 0.82913029, + "learning_rate": 0.0005420086317314473, + "loss": 0.84025246, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.16772461, + "step": 2542, + "time_per_iteration": 2.658069133758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104299, + "balance_loss_mlp": 1.08720386, + "epoch": 0.4892266256252405, + "flos": 590676406272.0, + "grad_norm": 0.06935244738816776, + "language_loss": 0.80814946, + "learning_rate": 
0.0005416981828073971, + "loss": 0.81919247, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.17102051, + "step": 2543, + "time_per_iteration": 2.818812608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_mlp": 1.02991831, + "epoch": 0.48941900731050403, + "flos": 1516296526848.0, + "grad_norm": 0.020152649211275964, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78154421, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.09472656, + "step": 2544, + "time_per_iteration": 4.891278028488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100982, + "balance_loss_mlp": 1.08363652, + "epoch": 0.4896113889957676, + "flos": 470564951040.0, + "grad_norm": 0.07927159683050183, + "language_loss": 0.85168952, + "learning_rate": 0.000541077236513819, + "loss": 0.86269933, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.17346191, + "step": 2545, + "time_per_iteration": 2.589184045791626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094215, + "balance_loss_mlp": 1.07689393, + "epoch": 0.48980377068103115, + "flos": 496557983232.0, + "grad_norm": 0.06748793045052295, + "language_loss": 0.82038838, + "learning_rate": 0.0005407667393853638, + "loss": 0.83133048, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.17333984, + "step": 2546, + "time_per_iteration": 2.6306400299072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099707, + "balance_loss_mlp": 1.08196878, + "epoch": 0.48999615236629473, + "flos": 692852382720.0, + "grad_norm": 0.08073962926855084, + "language_loss": 0.83248717, + "learning_rate": 0.0005404562264298569, + "loss": 0.84348422, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.17749023, + "step": 2547, + "time_per_iteration": 2.890744209289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097618, + "balance_loss_mlp": 
1.0795579, + "epoch": 0.49018853405155827, + "flos": 541694946816.0, + "grad_norm": 0.07477586030938296, + "language_loss": 0.83869213, + "learning_rate": 0.0005401456977678498, + "loss": 0.84966832, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.18078613, + "step": 2548, + "time_per_iteration": 2.691488027572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093416, + "balance_loss_mlp": 1.0753082, + "epoch": 0.49038091573682185, + "flos": 695663894016.0, + "grad_norm": 0.08381067722766777, + "language_loss": 0.77390134, + "learning_rate": 0.0005398351535199008, + "loss": 0.78483546, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.18103027, + "step": 2549, + "time_per_iteration": 3.0651490688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087007, + "balance_loss_mlp": 1.06931591, + "epoch": 0.49057329742208544, + "flos": 596902929408.0, + "grad_norm": 0.05957811074119609, + "language_loss": 0.83473563, + "learning_rate": 0.0005395245938065735, + "loss": 0.84560567, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.17712402, + "step": 2550, + "time_per_iteration": 2.7947916984558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085104, + "balance_loss_mlp": 1.06648386, + "epoch": 0.490765679107349, + "flos": 513406522368.0, + "grad_norm": 0.10016911025461137, + "language_loss": 0.82528293, + "learning_rate": 0.0005392140187484379, + "loss": 0.83613402, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.18603516, + "step": 2551, + "time_per_iteration": 2.6254496574401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089241, + "balance_loss_mlp": 1.0698818, + "epoch": 0.49095806079261256, + "flos": 629606670336.0, + "grad_norm": 0.05979290752357133, + "language_loss": 0.89496678, + "learning_rate": 0.0005389034284660701, + "loss": 0.90585923, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.19348145, + "step": 
2552, + "time_per_iteration": 2.8202950954437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096651, + "balance_loss_mlp": 1.07798314, + "epoch": 0.4911504424778761, + "flos": 915307941888.0, + "grad_norm": 0.09877873271676557, + "language_loss": 0.82097638, + "learning_rate": 0.000538592823080052, + "loss": 0.83194292, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.18676758, + "step": 2553, + "time_per_iteration": 3.156975507736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092631, + "balance_loss_mlp": 1.07395101, + "epoch": 0.4913428241631397, + "flos": 438943380480.0, + "grad_norm": 0.1092160541841064, + "language_loss": 0.84523845, + "learning_rate": 0.000538282202710971, + "loss": 0.85616469, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.18664551, + "step": 2554, + "time_per_iteration": 2.5290331840515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109959, + "balance_loss_mlp": 1.08045673, + "epoch": 0.4915352058484032, + "flos": 636092725248.0, + "grad_norm": 0.10555847882945492, + "language_loss": 0.82219321, + "learning_rate": 0.000537971567479421, + "loss": 0.83318907, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.19128418, + "step": 2555, + "time_per_iteration": 2.755554437637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094808, + "balance_loss_mlp": 1.07547224, + "epoch": 0.4917275875336668, + "flos": 504518989824.0, + "grad_norm": 0.0816634604134734, + "language_loss": 0.87386465, + "learning_rate": 0.0005376609175060011, + "loss": 0.88481277, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.19311523, + "step": 2556, + "time_per_iteration": 2.6251890659332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088346, + "balance_loss_mlp": 1.06941605, + "epoch": 0.49191996921893033, + "flos": 654547267584.0, + "grad_norm": 0.1007754916439506, + "language_loss": 0.80408537, + 
"learning_rate": 0.0005373502529113162, + "loss": 0.81496882, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.18920898, + "step": 2557, + "time_per_iteration": 2.8081767559051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080378, + "balance_loss_mlp": 1.06081533, + "epoch": 0.4921123509041939, + "flos": 492359980032.0, + "grad_norm": 0.09200682846254944, + "language_loss": 0.81391776, + "learning_rate": 0.0005370395738159773, + "loss": 0.82472152, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.19543457, + "step": 2558, + "time_per_iteration": 2.6609818935394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084764, + "balance_loss_mlp": 1.06559491, + "epoch": 0.4923047325894575, + "flos": 546167162880.0, + "grad_norm": 0.08064506015832804, + "language_loss": 0.82711154, + "learning_rate": 0.0005367288803406003, + "loss": 0.83795917, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.19165039, + "step": 2559, + "time_per_iteration": 2.644026756286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084251, + "balance_loss_mlp": 1.06544018, + "epoch": 0.49249711427472104, + "flos": 596473072128.0, + "grad_norm": 0.0889068964261426, + "language_loss": 0.81602907, + "learning_rate": 0.0005364181726058073, + "loss": 0.82687151, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.18798828, + "step": 2560, + "time_per_iteration": 2.7356274127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082609, + "balance_loss_mlp": 1.06403637, + "epoch": 0.4926894959599846, + "flos": 497825533440.0, + "grad_norm": 0.0950227496854857, + "language_loss": 0.82278556, + "learning_rate": 0.0005361074507322261, + "loss": 0.83361161, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.18566895, + "step": 2561, + "time_per_iteration": 2.663046360015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086945, + 
"balance_loss_mlp": 1.06827641, + "epoch": 0.49288187764524816, + "flos": 536130648576.0, + "grad_norm": 0.07772582275378431, + "language_loss": 0.81617248, + "learning_rate": 0.000535796714840489, + "loss": 0.82704192, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.18664551, + "step": 2562, + "time_per_iteration": 2.638414144515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_mlp": 1.07574439, + "epoch": 0.49307425933051174, + "flos": 641555707392.0, + "grad_norm": 0.08606941059340069, + "language_loss": 0.83548921, + "learning_rate": 0.0005354859650512348, + "loss": 0.84643233, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.18566895, + "step": 2563, + "time_per_iteration": 2.786123752593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103932, + "balance_loss_mlp": 1.08636093, + "epoch": 0.4932666410157753, + "flos": 516252911616.0, + "grad_norm": 0.10665890037430359, + "language_loss": 0.87337875, + "learning_rate": 0.0005351752014851074, + "loss": 0.88441813, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.17578125, + "step": 2564, + "time_per_iteration": 2.5858397483825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110675, + "balance_loss_mlp": 1.08847523, + "epoch": 0.49345902270103886, + "flos": 601503625728.0, + "grad_norm": 0.10057993561194663, + "language_loss": 0.83317149, + "learning_rate": 0.0005348644242627553, + "loss": 0.844239, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.1829834, + "step": 2565, + "time_per_iteration": 2.7638742923736572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050217, + "balance_loss_mlp": 1.04082322, + "epoch": 0.49365140438630245, + "flos": 1493673716736.0, + "grad_norm": 0.03479988729177956, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76336837, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 
0.09375, + "step": 2566, + "time_per_iteration": 4.947393417358398 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106718, + "balance_loss_mlp": 1.08951592, + "epoch": 0.493843786071566, + "flos": 629599329792.0, + "grad_norm": 0.06927642597141821, + "language_loss": 0.81322002, + "learning_rate": 0.0005342428293320013, + "loss": 0.82428724, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.17199707, + "step": 2567, + "time_per_iteration": 2.778985023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104881, + "balance_loss_mlp": 1.08785808, + "epoch": 0.49403616775682957, + "flos": 617564030976.0, + "grad_norm": 0.07155621127563581, + "language_loss": 0.83412832, + "learning_rate": 0.0005339320118649238, + "loss": 0.84517711, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.17041016, + "step": 2568, + "time_per_iteration": 2.7361106872558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118929, + "balance_loss_mlp": 1.10148847, + "epoch": 0.4942285494420931, + "flos": 577647770112.0, + "grad_norm": 0.06786367407396048, + "language_loss": 0.86708534, + "learning_rate": 0.000533621181224271, + "loss": 0.87827462, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.17443848, + "step": 2569, + "time_per_iteration": 2.8056747913360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113987, + "balance_loss_mlp": 1.09679675, + "epoch": 0.4944209311273567, + "flos": 630211995648.0, + "grad_norm": 0.08062562134183447, + "language_loss": 0.81321245, + "learning_rate": 0.0005333103375307182, + "loss": 0.82435232, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.17211914, + "step": 2570, + "time_per_iteration": 2.904440402984619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114683, + "balance_loss_mlp": 1.09786248, + "epoch": 0.4946133128126202, + "flos": 587612703744.0, + "grad_norm": 0.06756621210058887, + 
"language_loss": 0.8584491, + "learning_rate": 0.0005329994809049451, + "loss": 0.86959589, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.16833496, + "step": 2571, + "time_per_iteration": 2.8053295612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131581, + "balance_loss_mlp": 1.11458206, + "epoch": 0.4948056944978838, + "flos": 583718648832.0, + "grad_norm": 0.09358938815201079, + "language_loss": 0.87904042, + "learning_rate": 0.0005326886114676375, + "loss": 0.89035624, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.17016602, + "step": 2572, + "time_per_iteration": 2.8100666999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113844, + "balance_loss_mlp": 1.09724987, + "epoch": 0.49499807618314734, + "flos": 481822027776.0, + "grad_norm": 0.06954374103744322, + "language_loss": 0.87645632, + "learning_rate": 0.0005323777293394854, + "loss": 0.88759476, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.16601562, + "step": 2573, + "time_per_iteration": 2.6342670917510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112174, + "balance_loss_mlp": 1.09544909, + "epoch": 0.4951904578684109, + "flos": 518978161152.0, + "grad_norm": 0.06551139751330846, + "language_loss": 0.82055044, + "learning_rate": 0.000532066834641184, + "loss": 0.83167219, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.1673584, + "step": 2574, + "time_per_iteration": 2.7459301948547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115153, + "balance_loss_mlp": 1.09861851, + "epoch": 0.4953828395536745, + "flos": 535505499648.0, + "grad_norm": 0.07271172156944823, + "language_loss": 0.85062492, + "learning_rate": 0.0005317559274934334, + "loss": 0.86177647, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.16540527, + "step": 2575, + "time_per_iteration": 2.79950213432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01109887, + "balance_loss_mlp": 1.0929718, + "epoch": 0.49557522123893805, + "flos": 528564994560.0, + "grad_norm": 0.12491917898667039, + "language_loss": 0.80294836, + "learning_rate": 0.0005314450080169382, + "loss": 0.81404722, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.16931152, + "step": 2576, + "time_per_iteration": 2.646117687225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111519, + "balance_loss_mlp": 1.09459102, + "epoch": 0.49576760292420163, + "flos": 428007504384.0, + "grad_norm": 0.06948953090692808, + "language_loss": 0.80618382, + "learning_rate": 0.0005311340763324083, + "loss": 0.81729901, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.16931152, + "step": 2577, + "time_per_iteration": 2.637355327606201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115745, + "balance_loss_mlp": 1.09885335, + "epoch": 0.49595998460946517, + "flos": 565236942336.0, + "grad_norm": 0.06343391975743103, + "language_loss": 0.82572562, + "learning_rate": 0.0005308231325605578, + "loss": 0.83688301, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.16906738, + "step": 2578, + "time_per_iteration": 2.7532670497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.10721767, + "epoch": 0.49615236629472875, + "flos": 702490973184.0, + "grad_norm": 0.06763129936720796, + "language_loss": 0.76589197, + "learning_rate": 0.0005305121768221061, + "loss": 0.77713311, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.16906738, + "step": 2579, + "time_per_iteration": 3.099548816680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106106, + "balance_loss_mlp": 1.09718919, + "epoch": 0.4963447479799923, + "flos": 1441665630720.0, + "grad_norm": 0.03611799224355641, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76144433, + "num_input_tokens_seen": 215023648, + 
"router_z_loss_mlp": 0.08935547, + "step": 2580, + "time_per_iteration": 4.822290658950806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112299, + "balance_loss_mlp": 1.0955143, + "epoch": 0.49653712966525587, + "flos": 537627995136.0, + "grad_norm": 0.07683784808208224, + "language_loss": 0.91874099, + "learning_rate": 0.0005298902299282984, + "loss": 0.92986393, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.16796875, + "step": 2581, + "time_per_iteration": 2.6493284702301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117923, + "balance_loss_mlp": 1.10141301, + "epoch": 0.4967295113505194, + "flos": 607280467968.0, + "grad_norm": 0.09118838704679054, + "language_loss": 0.84425116, + "learning_rate": 0.0005295792390144033, + "loss": 0.85543042, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.16516113, + "step": 2582, + "time_per_iteration": 2.8000099658966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121007, + "balance_loss_mlp": 1.1042583, + "epoch": 0.496921893035783, + "flos": 474577574400.0, + "grad_norm": 0.08989559260345804, + "language_loss": 0.83660305, + "learning_rate": 0.0005292682366168294, + "loss": 0.84781313, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.16760254, + "step": 2583, + "time_per_iteration": 2.573913812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116101, + "balance_loss_mlp": 1.0993638, + "epoch": 0.4971142747210466, + "flos": 597463838208.0, + "grad_norm": 0.07863246165846992, + "language_loss": 0.79766655, + "learning_rate": 0.0005289572228563181, + "loss": 0.80882752, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.16748047, + "step": 2584, + "time_per_iteration": 2.807269811630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114321, + "balance_loss_mlp": 1.09676123, + "epoch": 0.4973066564063101, + "flos": 599603586048.0, + "grad_norm": 
0.06809186764850061, + "language_loss": 0.8288846, + "learning_rate": 0.000528646197853616, + "loss": 0.84002781, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.17578125, + "step": 2585, + "time_per_iteration": 2.806168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114035, + "balance_loss_mlp": 1.09709597, + "epoch": 0.4974990380915737, + "flos": 649474495488.0, + "grad_norm": 0.06908816819532054, + "language_loss": 0.85582453, + "learning_rate": 0.0005283351617294735, + "loss": 0.86696494, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.16943359, + "step": 2586, + "time_per_iteration": 2.926912784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_mlp": 1.02630937, + "epoch": 0.49769141977683723, + "flos": 1529278548480.0, + "grad_norm": 0.01596603428611825, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77671409, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.08447266, + "step": 2587, + "time_per_iteration": 5.0390965938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107376, + "balance_loss_mlp": 1.08937573, + "epoch": 0.4978838014621008, + "flos": 536370356736.0, + "grad_norm": 0.06339397332392985, + "language_loss": 0.86461538, + "learning_rate": 0.0005277130565998916, + "loss": 0.87568915, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.18005371, + "step": 2588, + "time_per_iteration": 2.770092248916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116474, + "balance_loss_mlp": 1.09942722, + "epoch": 0.49807618314736435, + "flos": 539616867840.0, + "grad_norm": 0.058229952595652015, + "language_loss": 0.81859887, + "learning_rate": 0.0005274019878359748, + "loss": 0.82976359, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.17053223, + "step": 2589, + "time_per_iteration": 2.7338075637817383 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01114654, + "balance_loss_mlp": 1.09733331, + "epoch": 0.49826856483262794, + "flos": 542475740160.0, + "grad_norm": 0.09126406549336552, + "language_loss": 0.86714995, + "learning_rate": 0.0005270909084336628, + "loss": 0.87829649, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.17333984, + "step": 2590, + "time_per_iteration": 2.65108323097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116441, + "balance_loss_mlp": 1.09858298, + "epoch": 0.4984609465178915, + "flos": 522321219072.0, + "grad_norm": 0.1060624554819127, + "language_loss": 0.88702905, + "learning_rate": 0.0005267798185137276, + "loss": 0.89819348, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.17871094, + "step": 2591, + "time_per_iteration": 2.6553287506103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105559, + "balance_loss_mlp": 1.08758211, + "epoch": 0.49865332820315506, + "flos": 574544420352.0, + "grad_norm": 0.13093350294478928, + "language_loss": 0.88770413, + "learning_rate": 0.0005264687181969444, + "loss": 0.89875972, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.17980957, + "step": 2592, + "time_per_iteration": 2.7969043254852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110929, + "balance_loss_mlp": 1.0928092, + "epoch": 0.49884570988841864, + "flos": 1013607115776.0, + "grad_norm": 0.07529154121690083, + "language_loss": 0.74930251, + "learning_rate": 0.0005261576076040937, + "loss": 0.76041174, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.18127441, + "step": 2593, + "time_per_iteration": 3.3571712970733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101158, + "balance_loss_mlp": 1.08368254, + "epoch": 0.4990380915736822, + "flos": 559581239808.0, + "grad_norm": 0.07032432999454871, + "language_loss": 0.83977568, + "learning_rate": 0.0005258464868559591, + "loss": 0.85078728, + 
"num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.17492676, + "step": 2594, + "time_per_iteration": 2.691549301147461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102198, + "balance_loss_mlp": 1.08469868, + "epoch": 0.49923047325894576, + "flos": 498954691584.0, + "grad_norm": 0.06016242034808734, + "language_loss": 0.88749588, + "learning_rate": 0.0005255353560733284, + "loss": 0.89851785, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.17529297, + "step": 2595, + "time_per_iteration": 2.643775701522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074264, + "balance_loss_mlp": 1.0654906, + "epoch": 0.4994228549442093, + "flos": 1496636476416.0, + "grad_norm": 0.03161132267250996, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76652908, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.08789062, + "step": 2596, + "time_per_iteration": 4.8261682987213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011005, + "balance_loss_mlp": 1.08255887, + "epoch": 0.4996152366294729, + "flos": 557374680576.0, + "grad_norm": 0.06872371897226848, + "language_loss": 0.83470559, + "learning_rate": 0.0005249130648877492, + "loss": 0.84571064, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.17956543, + "step": 2597, + "time_per_iteration": 2.793973445892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099762, + "balance_loss_mlp": 1.08096313, + "epoch": 0.4998076183147364, + "flos": 415594105344.0, + "grad_norm": 0.07739235171207769, + "language_loss": 0.84593171, + "learning_rate": 0.0005246019047263953, + "loss": 0.8569293, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.18798828, + "step": 2598, + "time_per_iteration": 2.5284597873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103028, + "balance_loss_mlp": 1.08447933, + "epoch": 0.5, + "flos": 467350373376.0, + 
"grad_norm": 0.0766017052589062, + "language_loss": 0.82300264, + "learning_rate": 0.0005242907350137353, + "loss": 0.83403295, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.18554688, + "step": 2599, + "time_per_iteration": 2.57824969291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102331, + "balance_loss_mlp": 1.08466387, + "epoch": 0.5001923816852636, + "flos": 482718818304.0, + "grad_norm": 0.07109220242790512, + "language_loss": 0.78955519, + "learning_rate": 0.0005239795558705754, + "loss": 0.80057847, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.17675781, + "step": 2600, + "time_per_iteration": 2.735712766647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093225, + "balance_loss_mlp": 1.07491398, + "epoch": 0.5003847633705272, + "flos": 533798180352.0, + "grad_norm": 0.0850656909263446, + "language_loss": 0.89518678, + "learning_rate": 0.0005236683674177264, + "loss": 0.90611899, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.18310547, + "step": 2601, + "time_per_iteration": 2.7013046741485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101472, + "balance_loss_mlp": 1.08336401, + "epoch": 0.5005771450557907, + "flos": 737789285376.0, + "grad_norm": 0.06829559635091415, + "language_loss": 0.82179487, + "learning_rate": 0.0005233571697760021, + "loss": 0.83280951, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.18103027, + "step": 2602, + "time_per_iteration": 2.902503490447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101813, + "balance_loss_mlp": 1.08420539, + "epoch": 0.5007695267410542, + "flos": 778977865728.0, + "grad_norm": 0.10152220944898022, + "language_loss": 0.82961535, + "learning_rate": 0.0005230459630662203, + "loss": 0.84063351, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.17626953, + "step": 2603, + "time_per_iteration": 2.966848134994507 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108628, + "balance_loss_mlp": 1.09103274, + "epoch": 0.5009619084263178, + "flos": 623476694016.0, + "grad_norm": 0.07939636618021073, + "language_loss": 0.8145076, + "learning_rate": 0.0005227347474092022, + "loss": 0.82559389, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.17602539, + "step": 2604, + "time_per_iteration": 2.76577091217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107422, + "balance_loss_mlp": 1.08948135, + "epoch": 0.5011542901115814, + "flos": 531087611904.0, + "grad_norm": 0.06357584490296206, + "language_loss": 0.82990885, + "learning_rate": 0.0005224235229257724, + "loss": 0.84098309, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.17956543, + "step": 2605, + "time_per_iteration": 2.798074245452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108194, + "balance_loss_mlp": 1.09092093, + "epoch": 0.5013466717968449, + "flos": 527534581248.0, + "grad_norm": 0.059877769950401664, + "language_loss": 0.86506116, + "learning_rate": 0.0005221122897367589, + "loss": 0.8761431, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.17285156, + "step": 2606, + "time_per_iteration": 2.8442416191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120744, + "balance_loss_mlp": 1.10386384, + "epoch": 0.5015390534821085, + "flos": 566017735680.0, + "grad_norm": 0.08858636737693353, + "language_loss": 0.81257951, + "learning_rate": 0.0005218010479629932, + "loss": 0.82378697, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.16882324, + "step": 2607, + "time_per_iteration": 2.720196485519409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112059, + "balance_loss_mlp": 1.09503603, + "epoch": 0.5017314351673721, + "flos": 566697212928.0, + "grad_norm": 0.09219088613115281, + "language_loss": 0.82021785, + "learning_rate": 0.0005214897977253102, + "loss": 0.83133841, + 
"num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.17041016, + "step": 2608, + "time_per_iteration": 2.6824939250946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104466, + "balance_loss_mlp": 1.08703792, + "epoch": 0.5019238168526357, + "flos": 522291483648.0, + "grad_norm": 0.05892482680876805, + "language_loss": 0.84221715, + "learning_rate": 0.0005211785391445473, + "loss": 0.85326183, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.17456055, + "step": 2609, + "time_per_iteration": 2.72525954246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.08809578, + "epoch": 0.5021161985378992, + "flos": 641434567680.0, + "grad_norm": 0.07489132465153774, + "language_loss": 0.79042387, + "learning_rate": 0.0005208672723415467, + "loss": 0.80148035, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.17553711, + "step": 2610, + "time_per_iteration": 2.8028247356414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110106, + "balance_loss_mlp": 1.08385801, + "epoch": 0.5023085802231627, + "flos": 591284302848.0, + "grad_norm": 0.08294073768606391, + "language_loss": 0.7915107, + "learning_rate": 0.0005205559974371525, + "loss": 0.80252123, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.17211914, + "step": 2611, + "time_per_iteration": 2.7850143909454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094923, + "balance_loss_mlp": 1.07810235, + "epoch": 0.5025009619084263, + "flos": 472373586432.0, + "grad_norm": 0.07295315460395477, + "language_loss": 0.82193494, + "learning_rate": 0.0005202447145522123, + "loss": 0.83288413, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.16821289, + "step": 2612, + "time_per_iteration": 2.700307607650757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090548, + "balance_loss_mlp": 1.07344127, + "epoch": 0.5026933435936899, + "flos": 
455139606528.0, + "grad_norm": 0.0792727031944949, + "language_loss": 0.79256612, + "learning_rate": 0.0005199334238075769, + "loss": 0.80347157, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.17126465, + "step": 2613, + "time_per_iteration": 2.6153087615966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089787, + "balance_loss_mlp": 1.07271576, + "epoch": 0.5028857252789535, + "flos": 491747314176.0, + "grad_norm": 0.08033639738386796, + "language_loss": 0.91661727, + "learning_rate": 0.0005196221253241, + "loss": 0.92751515, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.17089844, + "step": 2614, + "time_per_iteration": 2.6069750785827637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088826, + "balance_loss_mlp": 1.07155263, + "epoch": 0.503078106964217, + "flos": 625569454080.0, + "grad_norm": 0.07969948054344475, + "language_loss": 0.82871294, + "learning_rate": 0.0005193108192226383, + "loss": 0.83960116, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.17272949, + "step": 2615, + "time_per_iteration": 2.8156328201293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084024, + "balance_loss_mlp": 1.06673825, + "epoch": 0.5032704886494805, + "flos": 579046371840.0, + "grad_norm": 0.06296322155163143, + "language_loss": 0.86797768, + "learning_rate": 0.000518999505624052, + "loss": 0.87881792, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.1730957, + "step": 2616, + "time_per_iteration": 2.7152223587036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080227, + "balance_loss_mlp": 1.06292999, + "epoch": 0.5034628703347441, + "flos": 471753206784.0, + "grad_norm": 0.05958638296552923, + "language_loss": 0.83317488, + "learning_rate": 0.000518688184649203, + "loss": 0.84397715, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.17297363, + "step": 2617, + "time_per_iteration": 2.8284754753112793 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108043, + "balance_loss_mlp": 1.06272697, + "epoch": 0.5036552520200077, + "flos": 489837362688.0, + "grad_norm": 0.07368279711977406, + "language_loss": 0.83787394, + "learning_rate": 0.0005183768564189577, + "loss": 0.84867823, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.17724609, + "step": 2618, + "time_per_iteration": 2.591064453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083174, + "balance_loss_mlp": 1.06613898, + "epoch": 0.5038476337052713, + "flos": 494235426816.0, + "grad_norm": 0.08850035073541652, + "language_loss": 0.81363833, + "learning_rate": 0.0005180655210541838, + "loss": 0.82447004, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.17041016, + "step": 2619, + "time_per_iteration": 2.5832765102386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086664, + "balance_loss_mlp": 1.06910443, + "epoch": 0.5040400153905348, + "flos": 600604263936.0, + "grad_norm": 0.09602250816000424, + "language_loss": 0.8361724, + "learning_rate": 0.0005177541786757527, + "loss": 0.8470391, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.17565918, + "step": 2620, + "time_per_iteration": 2.8272600173950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081312, + "balance_loss_mlp": 1.0633707, + "epoch": 0.5042323970757984, + "flos": 811525962240.0, + "grad_norm": 0.08634316495635827, + "language_loss": 0.82817882, + "learning_rate": 0.000517442829404538, + "loss": 0.838992, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.17956543, + "step": 2621, + "time_per_iteration": 3.0231099128723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108588, + "balance_loss_mlp": 1.06736684, + "epoch": 0.504424778761062, + "flos": 627308706816.0, + "grad_norm": 0.07086048560872778, + "language_loss": 0.87109387, + "learning_rate": 0.0005171314733614166, + "loss": 0.88195264, + 
"num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.18505859, + "step": 2622, + "time_per_iteration": 2.924490213394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092184, + "balance_loss_mlp": 1.07450485, + "epoch": 0.5046171604463255, + "flos": 515911887360.0, + "grad_norm": 0.09670552238526126, + "language_loss": 0.78441215, + "learning_rate": 0.0005168201106672671, + "loss": 0.79533398, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.17700195, + "step": 2623, + "time_per_iteration": 2.7627530097961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081433, + "balance_loss_mlp": 1.06351626, + "epoch": 0.504809542131589, + "flos": 527831188992.0, + "grad_norm": 0.07080566946451637, + "language_loss": 0.8469494, + "learning_rate": 0.0005165087414429717, + "loss": 0.85776377, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.17932129, + "step": 2624, + "time_per_iteration": 2.6216189861297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078588, + "balance_loss_mlp": 1.06013489, + "epoch": 0.5050019238168526, + "flos": 554118257664.0, + "grad_norm": 0.07518378231968396, + "language_loss": 0.83469629, + "learning_rate": 0.0005161973658094144, + "loss": 0.84548217, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.18444824, + "step": 2625, + "time_per_iteration": 2.686030864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077122, + "balance_loss_mlp": 1.05919266, + "epoch": 0.5051943055021162, + "flos": 574774216704.0, + "grad_norm": 0.07052814404413787, + "language_loss": 0.82367003, + "learning_rate": 0.000515885983887482, + "loss": 0.83444118, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.17944336, + "step": 2626, + "time_per_iteration": 2.742265224456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073389, + "balance_loss_mlp": 1.05478024, + "epoch": 0.5053866871873798, + "flos": 
496686463488.0, + "grad_norm": 0.0761715011076948, + "language_loss": 0.84318763, + "learning_rate": 0.0005155745957980636, + "loss": 0.85392147, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.18615723, + "step": 2627, + "time_per_iteration": 2.6049954891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074823, + "balance_loss_mlp": 1.05586839, + "epoch": 0.5055790688726434, + "flos": 502213685760.0, + "grad_norm": 0.07614118511738227, + "language_loss": 0.88045084, + "learning_rate": 0.000515263201662051, + "loss": 0.89119911, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.18945312, + "step": 2628, + "time_per_iteration": 2.7101621627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084597, + "balance_loss_mlp": 1.06617892, + "epoch": 0.5057714505579068, + "flos": 845227809792.0, + "grad_norm": 0.07415998964954142, + "language_loss": 0.82280606, + "learning_rate": 0.0005149518016003378, + "loss": 0.83365202, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.1842041, + "step": 2629, + "time_per_iteration": 3.194669723510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080493, + "balance_loss_mlp": 1.06227767, + "epoch": 0.5059638322431704, + "flos": 497825533440.0, + "grad_norm": 0.07616905133259881, + "language_loss": 0.8214519, + "learning_rate": 0.0005146403957338206, + "loss": 0.83225679, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.18212891, + "step": 2630, + "time_per_iteration": 2.6495327949523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092259, + "balance_loss_mlp": 1.07468796, + "epoch": 0.506156213928434, + "flos": 617843013120.0, + "grad_norm": 0.06296513552488332, + "language_loss": 0.81962919, + "learning_rate": 0.0005143289841833975, + "loss": 0.8305518, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.17578125, + "step": 2631, + "time_per_iteration": 2.8716421127319336 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092491, + "balance_loss_mlp": 1.07512259, + "epoch": 0.5063485956136976, + "flos": 424857166848.0, + "grad_norm": 0.0779936416436138, + "language_loss": 0.82076275, + "learning_rate": 0.0005140175670699696, + "loss": 0.83168757, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.17382812, + "step": 2632, + "time_per_iteration": 2.6159043312072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108859, + "balance_loss_mlp": 1.07069623, + "epoch": 0.5065409772989612, + "flos": 569926471680.0, + "grad_norm": 0.053505876641590386, + "language_loss": 0.82692468, + "learning_rate": 0.0005137061445144395, + "loss": 0.83781052, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.17895508, + "step": 2633, + "time_per_iteration": 2.9435369968414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102566, + "balance_loss_mlp": 1.08499455, + "epoch": 0.5067333589842247, + "flos": 628801284096.0, + "grad_norm": 0.07429237358898076, + "language_loss": 0.86728698, + "learning_rate": 0.000513394716637712, + "loss": 0.87831259, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.17590332, + "step": 2634, + "time_per_iteration": 2.785621404647827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031528, + "balance_loss_mlp": 1.02165747, + "epoch": 0.5069257406694883, + "flos": 1447867187712.0, + "grad_norm": 0.025420781551357425, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80223238, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.09863281, + "step": 2635, + "time_per_iteration": 4.87060809135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103723, + "balance_loss_mlp": 1.08666396, + "epoch": 0.5071181223547518, + "flos": 638835227136.0, + "grad_norm": 0.0808554701524121, + "language_loss": 0.8102541, + "learning_rate": 0.0005127718454042958, + "loss": 0.82129133, + 
"num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.1706543, + "step": 2636, + "time_per_iteration": 2.8784031867980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102685, + "balance_loss_mlp": 1.08523273, + "epoch": 0.5073105040400154, + "flos": 713565241344.0, + "grad_norm": 0.07186288747403746, + "language_loss": 0.84171808, + "learning_rate": 0.0005124604022894269, + "loss": 0.85274494, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.17468262, + "step": 2637, + "time_per_iteration": 2.9495620727539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018568, + "balance_loss_mlp": 1.00903082, + "epoch": 0.5075028857252789, + "flos": 1436447126016.0, + "grad_norm": 0.013467544944548519, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78206789, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.09521484, + "step": 2638, + "time_per_iteration": 4.841961145401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100977, + "balance_loss_mlp": 1.08402538, + "epoch": 0.5076952674105425, + "flos": 571147034112.0, + "grad_norm": 0.0754060533252176, + "language_loss": 0.83016658, + "learning_rate": 0.0005118375016679325, + "loss": 0.84117633, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.16967773, + "step": 2639, + "time_per_iteration": 2.7659313678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094497, + "balance_loss_mlp": 1.07784295, + "epoch": 0.5078876490958061, + "flos": 516712504320.0, + "grad_norm": 0.08036414838520123, + "language_loss": 0.80592823, + "learning_rate": 0.0005115260444031382, + "loss": 0.81687325, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.16662598, + "step": 2640, + "time_per_iteration": 2.6009633541107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012329, + "balance_loss_mlp": 1.00350785, + "epoch": 0.5080800307810697, + "flos": 
1584224428032.0, + "grad_norm": 0.011999730841431432, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79744148, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.08837891, + "step": 2641, + "time_per_iteration": 4.949390411376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097313, + "balance_loss_mlp": 1.08012342, + "epoch": 0.5082724124663333, + "flos": 485209502208.0, + "grad_norm": 0.07347538330964974, + "language_loss": 0.87067777, + "learning_rate": 0.0005109031165700483, + "loss": 0.88165087, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.17211914, + "step": 2642, + "time_per_iteration": 2.571359634399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089013, + "balance_loss_mlp": 1.07212138, + "epoch": 0.5084647941515967, + "flos": 682230366720.0, + "grad_norm": 0.07982577059913512, + "language_loss": 0.8353101, + "learning_rate": 0.0005105916462435945, + "loss": 0.84620023, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.16894531, + "step": 2643, + "time_per_iteration": 2.853332996368408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090538, + "balance_loss_mlp": 1.07358634, + "epoch": 0.5086571758368603, + "flos": 548736768000.0, + "grad_norm": 0.06767023016464803, + "language_loss": 0.85332114, + "learning_rate": 0.0005102801718050989, + "loss": 0.86422646, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.16967773, + "step": 2644, + "time_per_iteration": 2.71907377243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085318, + "balance_loss_mlp": 1.06869972, + "epoch": 0.5088495575221239, + "flos": 564016379904.0, + "grad_norm": 0.08980112743883228, + "language_loss": 0.89031243, + "learning_rate": 0.0005099686933754867, + "loss": 0.9011656, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.16625977, + "step": 2645, + "time_per_iteration": 2.759768009185791 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108757, + "balance_loss_mlp": 1.07075, + "epoch": 0.5090419392073875, + "flos": 551407689216.0, + "grad_norm": 0.07519563415405216, + "language_loss": 0.84095073, + "learning_rate": 0.0005096572110756845, + "loss": 0.85182643, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.16833496, + "step": 2646, + "time_per_iteration": 2.742478132247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083884, + "balance_loss_mlp": 1.06656277, + "epoch": 0.509234320892651, + "flos": 567779383296.0, + "grad_norm": 0.06876057003625125, + "language_loss": 0.85465425, + "learning_rate": 0.0005093457250266205, + "loss": 0.86549312, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.17333984, + "step": 2647, + "time_per_iteration": 2.762909173965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091086, + "balance_loss_mlp": 1.073717, + "epoch": 0.5094267025779146, + "flos": 582609314304.0, + "grad_norm": 0.1044353617825215, + "language_loss": 0.8341682, + "learning_rate": 0.000509034235349224, + "loss": 0.84507906, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.1739502, + "step": 2648, + "time_per_iteration": 2.726165533065796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109791, + "balance_loss_mlp": 1.08109021, + "epoch": 0.5096190842631781, + "flos": 591990944256.0, + "grad_norm": 0.07313436933557896, + "language_loss": 0.81423604, + "learning_rate": 0.0005087227421644266, + "loss": 0.8252151, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.16821289, + "step": 2649, + "time_per_iteration": 2.753390312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108015, + "balance_loss_mlp": 1.09102726, + "epoch": 0.5098114659484417, + "flos": 513562166784.0, + "grad_norm": 0.0718220857310726, + "language_loss": 0.85905892, + "learning_rate": 0.0005084112455931602, + "loss": 0.87013906, + 
"num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.17004395, + "step": 2650, + "time_per_iteration": 2.5981361865997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116436, + "balance_loss_mlp": 1.0991627, + "epoch": 0.5100038476337053, + "flos": 484631341056.0, + "grad_norm": 0.0710139819724768, + "language_loss": 0.84867871, + "learning_rate": 0.0005080997457563586, + "loss": 0.85984302, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.17297363, + "step": 2651, + "time_per_iteration": 2.5604488849639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125127, + "balance_loss_mlp": 1.10802007, + "epoch": 0.5101962293189688, + "flos": 461603266560.0, + "grad_norm": 0.08475984872157578, + "language_loss": 0.78772122, + "learning_rate": 0.0005077882427749569, + "loss": 0.79897249, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.17114258, + "step": 2652, + "time_per_iteration": 2.5588836669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137152, + "balance_loss_mlp": 1.12011659, + "epoch": 0.5103886110042324, + "flos": 587034542592.0, + "grad_norm": 0.0878101507805391, + "language_loss": 0.84672785, + "learning_rate": 0.0005074767367698913, + "loss": 0.85809934, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.17041016, + "step": 2653, + "time_per_iteration": 2.7424826622009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113443, + "balance_loss_mlp": 1.11758542, + "epoch": 0.510580992689496, + "flos": 845260116480.0, + "grad_norm": 0.10879937034210539, + "language_loss": 0.83426005, + "learning_rate": 0.0005071652278620988, + "loss": 0.8456043, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.16845703, + "step": 2654, + "time_per_iteration": 3.09969162940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124784, + "balance_loss_mlp": 1.10785651, + "epoch": 0.5107733743747596, + "flos": 
658624131072.0, + "grad_norm": 0.10475987580925356, + "language_loss": 0.83118153, + "learning_rate": 0.0005068537161725186, + "loss": 0.8424294, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.16943359, + "step": 2655, + "time_per_iteration": 2.82289719581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116916, + "balance_loss_mlp": 1.09999979, + "epoch": 0.510965756060023, + "flos": 701732574720.0, + "grad_norm": 0.07925993280329827, + "language_loss": 0.84691739, + "learning_rate": 0.0005065422018220893, + "loss": 0.85808647, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.16931152, + "step": 2656, + "time_per_iteration": 2.8794078826904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112009, + "balance_loss_mlp": 1.09535527, + "epoch": 0.5111581377452866, + "flos": 559731741696.0, + "grad_norm": 0.07178639525503218, + "language_loss": 0.80310833, + "learning_rate": 0.0005062306849317521, + "loss": 0.81422836, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.16662598, + "step": 2657, + "time_per_iteration": 2.814025402069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110163, + "balance_loss_mlp": 1.09374762, + "epoch": 0.5113505194305502, + "flos": 609024863232.0, + "grad_norm": 0.09425319021973573, + "language_loss": 0.83069956, + "learning_rate": 0.0005059191656224487, + "loss": 0.84180123, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.16418457, + "step": 2658, + "time_per_iteration": 2.7602522373199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110977, + "balance_loss_mlp": 1.09316397, + "epoch": 0.5115429011158138, + "flos": 534477657600.0, + "grad_norm": 0.10010645818095278, + "language_loss": 0.89003229, + "learning_rate": 0.0005056076440151212, + "loss": 0.90113008, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.1661377, + "step": 2659, + "time_per_iteration": 2.7027831077575684 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071712, + "balance_loss_mlp": 1.06413066, + "epoch": 0.5117352828010774, + "flos": 1362213780480.0, + "grad_norm": 0.039772151853185514, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77359831, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.07568359, + "step": 2660, + "time_per_iteration": 4.856590032577515 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115167, + "balance_loss_mlp": 1.09887075, + "epoch": 0.5119276644863409, + "flos": 633740433408.0, + "grad_norm": 0.06725256479668422, + "language_loss": 0.86826003, + "learning_rate": 0.0005049845943901691, + "loss": 0.87941164, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.16296387, + "step": 2661, + "time_per_iteration": 2.8570423126220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122122, + "balance_loss_mlp": 1.10631514, + "epoch": 0.5121200461716044, + "flos": 585598864896.0, + "grad_norm": 0.0894536064907193, + "language_loss": 0.8667441, + "learning_rate": 0.0005046730666144338, + "loss": 0.87796533, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.15795898, + "step": 2662, + "time_per_iteration": 2.883822202682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119148, + "balance_loss_mlp": 1.10315049, + "epoch": 0.512312427856868, + "flos": 1032508767744.0, + "grad_norm": 0.06658438993973123, + "language_loss": 0.87964702, + "learning_rate": 0.0005043615370244532, + "loss": 0.8908385, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.15991211, + "step": 2663, + "time_per_iteration": 3.388521671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_mlp": 1.02103686, + "epoch": 0.5125048095421316, + "flos": 1537983645696.0, + "grad_norm": 0.01281563800895277, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79272962, 
+ "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.07519531, + "step": 2664, + "time_per_iteration": 4.6337666511535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119325, + "balance_loss_mlp": 1.10361338, + "epoch": 0.5126971912273951, + "flos": 591116175360.0, + "grad_norm": 0.058968241204554794, + "language_loss": 0.85154796, + "learning_rate": 0.0005037384728855425, + "loss": 0.86274123, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.15698242, + "step": 2665, + "time_per_iteration": 2.8316938877105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116454, + "balance_loss_mlp": 1.10032547, + "epoch": 0.5128895729126587, + "flos": 551657309184.0, + "grad_norm": 0.07313815870373463, + "language_loss": 0.8427707, + "learning_rate": 0.0005034269385785075, + "loss": 0.85393524, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.16125488, + "step": 2666, + "time_per_iteration": 2.705953359603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119966, + "balance_loss_mlp": 1.10405147, + "epoch": 0.5130819545979223, + "flos": 481271030784.0, + "grad_norm": 0.09131160106886373, + "language_loss": 0.84140623, + "learning_rate": 0.0005031154029410168, + "loss": 0.85260594, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.15905762, + "step": 2667, + "time_per_iteration": 2.5483505725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121112, + "balance_loss_mlp": 1.10497081, + "epoch": 0.5132743362831859, + "flos": 475798136832.0, + "grad_norm": 0.07350853386407429, + "language_loss": 0.86393219, + "learning_rate": 0.0005028038660940197, + "loss": 0.87514335, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.16137695, + "step": 2668, + "time_per_iteration": 2.5729174613952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117082, + "balance_loss_mlp": 1.10103667, + "epoch": 0.5134667179684494, + "flos": 
503827029504.0, + "grad_norm": 0.06973928207648594, + "language_loss": 0.84257567, + "learning_rate": 0.0005024923281584648, + "loss": 0.85374653, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.16040039, + "step": 2669, + "time_per_iteration": 2.695422410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112964, + "balance_loss_mlp": 1.11378479, + "epoch": 0.5136590996537129, + "flos": 503918433792.0, + "grad_norm": 0.07121106891997668, + "language_loss": 0.82480651, + "learning_rate": 0.0005021807892553026, + "loss": 0.8361029, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.15844727, + "step": 2670, + "time_per_iteration": 2.751401662826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129611, + "balance_loss_mlp": 1.11330318, + "epoch": 0.5138514813389765, + "flos": 624623104512.0, + "grad_norm": 0.07354407823714339, + "language_loss": 0.84572917, + "learning_rate": 0.0005018692495054828, + "loss": 0.85702527, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.16308594, + "step": 2671, + "time_per_iteration": 2.757593870162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123606, + "balance_loss_mlp": 1.10785806, + "epoch": 0.5140438630242401, + "flos": 583545752064.0, + "grad_norm": 0.06661441717787603, + "language_loss": 0.80650961, + "learning_rate": 0.0005015577090299561, + "loss": 0.81774569, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.15734863, + "step": 2672, + "time_per_iteration": 2.693725347518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110731, + "balance_loss_mlp": 1.09435153, + "epoch": 0.5142362447095037, + "flos": 487927411200.0, + "grad_norm": 0.07298787487316409, + "language_loss": 0.86515582, + "learning_rate": 0.0005012461679496729, + "loss": 0.87626314, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.16381836, + "step": 2673, + "time_per_iteration": 2.6318869590759277 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111417, + "balance_loss_mlp": 1.09533608, + "epoch": 0.5144286263947672, + "flos": 526857675264.0, + "grad_norm": 0.07740296935823926, + "language_loss": 0.87230647, + "learning_rate": 0.0005009346263855848, + "loss": 0.88342059, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.1607666, + "step": 2674, + "time_per_iteration": 2.6561901569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108221, + "balance_loss_mlp": 1.09159088, + "epoch": 0.5146210080800308, + "flos": 486518897664.0, + "grad_norm": 0.0608007463380774, + "language_loss": 0.83338469, + "learning_rate": 0.0005006230844586422, + "loss": 0.84446692, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.16638184, + "step": 2675, + "time_per_iteration": 2.7956371307373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110833, + "balance_loss_mlp": 1.09186745, + "epoch": 0.5148133897652943, + "flos": 515892063744.0, + "grad_norm": 0.06956599587127472, + "language_loss": 0.78915107, + "learning_rate": 0.0005003115422897968, + "loss": 0.80023432, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.16467285, + "step": 2676, + "time_per_iteration": 2.8026392459869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098426, + "balance_loss_mlp": 1.08178461, + "epoch": 0.5150057714505579, + "flos": 511212446208.0, + "grad_norm": 0.06380905094740742, + "language_loss": 0.87044096, + "learning_rate": 0.0005, + "loss": 0.8814252, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.16650391, + "step": 2677, + "time_per_iteration": 2.6397616863250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096356, + "balance_loss_mlp": 1.07940435, + "epoch": 0.5151981531358215, + "flos": 910909877760.0, + "grad_norm": 0.06972488542821374, + "language_loss": 0.79243249, + "learning_rate": 0.0004996884577102033, + "loss": 0.80339611, + 
"num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.16967773, + "step": 2678, + "time_per_iteration": 3.1194515228271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109136, + "balance_loss_mlp": 1.07438445, + "epoch": 0.515390534821085, + "flos": 471864434688.0, + "grad_norm": 0.07627965924369287, + "language_loss": 0.84695083, + "learning_rate": 0.000499376915541358, + "loss": 0.85786444, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.16992188, + "step": 2679, + "time_per_iteration": 2.7068095207214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089943, + "balance_loss_mlp": 1.07359934, + "epoch": 0.5155829165063486, + "flos": 650119468032.0, + "grad_norm": 0.06818096885322372, + "language_loss": 0.81243503, + "learning_rate": 0.0004990653736144155, + "loss": 0.8233344, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.16345215, + "step": 2680, + "time_per_iteration": 2.8939812183380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108849, + "balance_loss_mlp": 1.07127619, + "epoch": 0.5157752981916122, + "flos": 414262315008.0, + "grad_norm": 0.06989870799279192, + "language_loss": 0.85872787, + "learning_rate": 0.0004987538320503271, + "loss": 0.86961281, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.17236328, + "step": 2681, + "time_per_iteration": 2.5216612815856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082053, + "balance_loss_mlp": 1.06468463, + "epoch": 0.5159676798768758, + "flos": 553841473536.0, + "grad_norm": 0.08598338754099338, + "language_loss": 0.82912159, + "learning_rate": 0.0004984422909700442, + "loss": 0.8399421, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.17382812, + "step": 2682, + "time_per_iteration": 2.665601968765259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081809, + "balance_loss_mlp": 1.06371331, + "epoch": 0.5161600615621393, + "flos": 
586510709760.0, + "grad_norm": 0.06868623883512981, + "language_loss": 0.8358953, + "learning_rate": 0.0004981307504945173, + "loss": 0.84671342, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.1809082, + "step": 2683, + "time_per_iteration": 2.744506597518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084499, + "balance_loss_mlp": 1.06714213, + "epoch": 0.5163524432474028, + "flos": 588843177984.0, + "grad_norm": 0.07139371766694287, + "language_loss": 0.89118385, + "learning_rate": 0.0004978192107446976, + "loss": 0.9020288, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.17370605, + "step": 2684, + "time_per_iteration": 2.840625762939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107857, + "balance_loss_mlp": 1.06075978, + "epoch": 0.5165448249326664, + "flos": 503893840896.0, + "grad_norm": 0.07781566774681065, + "language_loss": 0.87333429, + "learning_rate": 0.0004975076718415353, + "loss": 0.88411999, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.17810059, + "step": 2685, + "time_per_iteration": 2.6297128200531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076856, + "balance_loss_mlp": 1.05923653, + "epoch": 0.51673720661793, + "flos": 416760339456.0, + "grad_norm": 0.07734898237902697, + "language_loss": 0.90289825, + "learning_rate": 0.0004971961339059806, + "loss": 0.91366684, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.17626953, + "step": 2686, + "time_per_iteration": 2.5235214233398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079451, + "balance_loss_mlp": 1.06149805, + "epoch": 0.5169295883031936, + "flos": 598971096576.0, + "grad_norm": 0.08309998288602231, + "language_loss": 0.84119761, + "learning_rate": 0.0004968845970589832, + "loss": 0.85199213, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.17956543, + "step": 2687, + "time_per_iteration": 2.6999969482421875 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085317, + "balance_loss_mlp": 1.06760216, + "epoch": 0.517121969988457, + "flos": 556816343040.0, + "grad_norm": 0.0817039791962864, + "language_loss": 0.84468675, + "learning_rate": 0.0004965730614214926, + "loss": 0.85553992, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.17724609, + "step": 2688, + "time_per_iteration": 2.658827066421509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078556, + "balance_loss_mlp": 1.06094825, + "epoch": 0.5173143516737206, + "flos": 469445704704.0, + "grad_norm": 0.07334441433702203, + "language_loss": 0.85342443, + "learning_rate": 0.0004962615271144576, + "loss": 0.86421001, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.17626953, + "step": 2689, + "time_per_iteration": 2.50878643989563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086966, + "balance_loss_mlp": 1.06994319, + "epoch": 0.5175067333589842, + "flos": 720065977344.0, + "grad_norm": 0.12467871415324963, + "language_loss": 0.82284343, + "learning_rate": 0.0004959499942588264, + "loss": 0.83371305, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.17028809, + "step": 2690, + "time_per_iteration": 2.9249496459960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104705, + "balance_loss_mlp": 1.03822827, + "epoch": 0.5176991150442478, + "flos": 1466188480512.0, + "grad_norm": 0.03199266467607697, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79247075, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.08837891, + "step": 2691, + "time_per_iteration": 4.82594108581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090274, + "balance_loss_mlp": 1.07309616, + "epoch": 0.5178914967295114, + "flos": 612632222208.0, + "grad_norm": 0.07423408614425925, + "language_loss": 0.85369182, + "learning_rate": 0.0004953269333855661, + "loss": 0.86459452, + 
"num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.17175293, + "step": 2692, + "time_per_iteration": 2.777863025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093446, + "balance_loss_mlp": 1.07593369, + "epoch": 0.5180838784147749, + "flos": 500926311936.0, + "grad_norm": 0.08941680356551608, + "language_loss": 0.84251738, + "learning_rate": 0.0004950154056098309, + "loss": 0.85345179, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.17529297, + "step": 2693, + "time_per_iteration": 2.7481398582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097937, + "balance_loss_mlp": 1.08010364, + "epoch": 0.5182762601000385, + "flos": 688832418816.0, + "grad_norm": 0.07099923409869693, + "language_loss": 0.84394872, + "learning_rate": 0.0004947038797692867, + "loss": 0.85492814, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.1784668, + "step": 2694, + "time_per_iteration": 2.8453128337860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113818, + "balance_loss_mlp": 1.096771, + "epoch": 0.518468641785302, + "flos": 665611623936.0, + "grad_norm": 0.06154827687851128, + "language_loss": 0.77520609, + "learning_rate": 0.0004943923559848789, + "loss": 0.78634429, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.1706543, + "step": 2695, + "time_per_iteration": 2.841853141784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124321, + "balance_loss_mlp": 1.10654736, + "epoch": 0.5186610234705656, + "flos": 566714465280.0, + "grad_norm": 0.06645104429405103, + "language_loss": 0.90406942, + "learning_rate": 0.0004940808343775515, + "loss": 0.91531265, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.17773438, + "step": 2696, + "time_per_iteration": 2.749504327774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118087, + "balance_loss_mlp": 1.10027719, + "epoch": 0.5188534051558291, + "flos": 
428879702016.0, + "grad_norm": 0.07841169466401897, + "language_loss": 0.82063687, + "learning_rate": 0.0004937693150682479, + "loss": 0.83181769, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.17810059, + "step": 2697, + "time_per_iteration": 2.5522847175598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118895, + "balance_loss_mlp": 1.10168159, + "epoch": 0.5190457868410927, + "flos": 546349971456.0, + "grad_norm": 0.07394243959698338, + "language_loss": 0.76709116, + "learning_rate": 0.0004934577981779107, + "loss": 0.77828008, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.17224121, + "step": 2698, + "time_per_iteration": 2.72316312789917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115498, + "balance_loss_mlp": 1.09879637, + "epoch": 0.5192381685263563, + "flos": 548605716480.0, + "grad_norm": 0.0912267088784467, + "language_loss": 0.8119272, + "learning_rate": 0.0004931462838274817, + "loss": 0.82308215, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.16711426, + "step": 2699, + "time_per_iteration": 2.8209919929504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107121, + "balance_loss_mlp": 1.08981156, + "epoch": 0.5194305502116199, + "flos": 575263544832.0, + "grad_norm": 0.10066489144579434, + "language_loss": 0.83903617, + "learning_rate": 0.0004928347721379011, + "loss": 0.85010743, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.17333984, + "step": 2700, + "time_per_iteration": 2.679414749145508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098221, + "balance_loss_mlp": 1.08088803, + "epoch": 0.5196229318968835, + "flos": 434258620416.0, + "grad_norm": 0.06308374672073903, + "language_loss": 0.82055807, + "learning_rate": 0.0004925232632301089, + "loss": 0.83154029, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.17346191, + "step": 2701, + "time_per_iteration": 2.5568413734436035 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086175, + "balance_loss_mlp": 1.06934261, + "epoch": 0.5198153135821469, + "flos": 558881938944.0, + "grad_norm": 0.07257701027520803, + "language_loss": 0.79591668, + "learning_rate": 0.0004922117572250431, + "loss": 0.80677843, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.16845703, + "step": 2702, + "time_per_iteration": 2.6907496452331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085203, + "balance_loss_mlp": 1.06819224, + "epoch": 0.5200076952674105, + "flos": 565684051968.0, + "grad_norm": 0.08909916825126464, + "language_loss": 0.80501723, + "learning_rate": 0.0004919002542436414, + "loss": 0.81586921, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.17016602, + "step": 2703, + "time_per_iteration": 2.8154964447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087916, + "balance_loss_mlp": 1.07078612, + "epoch": 0.5202000769526741, + "flos": 571186681344.0, + "grad_norm": 0.07574293506029897, + "language_loss": 0.8094272, + "learning_rate": 0.0004915887544068399, + "loss": 0.82030636, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.17138672, + "step": 2704, + "time_per_iteration": 2.6723296642303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080297, + "balance_loss_mlp": 1.06322646, + "epoch": 0.5203924586379377, + "flos": 694211337216.0, + "grad_norm": 0.08223729103851085, + "language_loss": 0.78410661, + "learning_rate": 0.0004912772578355736, + "loss": 0.79490954, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.1706543, + "step": 2705, + "time_per_iteration": 2.904359817504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080431, + "balance_loss_mlp": 1.06288326, + "epoch": 0.5205848403232012, + "flos": 566509261824.0, + "grad_norm": 0.0867272148609526, + "language_loss": 0.82534099, + "learning_rate": 0.000490965764650776, + "loss": 0.83614528, 
+ "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.17553711, + "step": 2706, + "time_per_iteration": 2.893965005874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082267, + "balance_loss_mlp": 1.06508923, + "epoch": 0.5207772220084648, + "flos": 1214259932160.0, + "grad_norm": 0.08899008608425168, + "language_loss": 0.82646501, + "learning_rate": 0.0004906542749733798, + "loss": 0.83728766, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.171875, + "step": 2707, + "time_per_iteration": 3.642857313156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081324, + "balance_loss_mlp": 1.06468248, + "epoch": 0.5209696036937284, + "flos": 592843318272.0, + "grad_norm": 0.06383765372803735, + "language_loss": 0.85145414, + "learning_rate": 0.0004903427889243156, + "loss": 0.86226737, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.16650391, + "step": 2708, + "time_per_iteration": 2.8898375034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091262, + "balance_loss_mlp": 1.074036, + "epoch": 0.5211619853789919, + "flos": 522889468416.0, + "grad_norm": 0.07905445780966364, + "language_loss": 0.85149866, + "learning_rate": 0.0004900313066245134, + "loss": 0.86241126, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.17236328, + "step": 2709, + "time_per_iteration": 2.65574049949646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088104, + "balance_loss_mlp": 1.07130718, + "epoch": 0.5213543670642555, + "flos": 502799187456.0, + "grad_norm": 0.07812284997006956, + "language_loss": 0.80880928, + "learning_rate": 0.0004897198281949012, + "loss": 0.81969029, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.16796875, + "step": 2710, + "time_per_iteration": 2.672153949737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103259, + "balance_loss_mlp": 1.08604503, + "epoch": 0.521546748749519, + "flos": 
585959712768.0, + "grad_norm": 0.07691692452987973, + "language_loss": 0.77799213, + "learning_rate": 0.0004894083537564057, + "loss": 0.78902471, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.17236328, + "step": 2711, + "time_per_iteration": 2.7532706260681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104375, + "balance_loss_mlp": 1.08732796, + "epoch": 0.5217391304347826, + "flos": 570119192064.0, + "grad_norm": 0.07306223578012608, + "language_loss": 0.80945504, + "learning_rate": 0.0004890968834299519, + "loss": 0.82049876, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.1706543, + "step": 2712, + "time_per_iteration": 2.7456612586975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113403, + "balance_loss_mlp": 1.09663057, + "epoch": 0.5219315121200462, + "flos": 542784457728.0, + "grad_norm": 0.06414784694166918, + "language_loss": 0.7858941, + "learning_rate": 0.0004887854173364633, + "loss": 0.79702818, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.16784668, + "step": 2713, + "time_per_iteration": 2.731410503387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116912, + "balance_loss_mlp": 1.10033011, + "epoch": 0.5221238938053098, + "flos": 550310464512.0, + "grad_norm": 0.062429546921528134, + "language_loss": 0.8127901, + "learning_rate": 0.0004884739555968617, + "loss": 0.82395923, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.16589355, + "step": 2714, + "time_per_iteration": 2.8288521766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024153, + "balance_loss_mlp": 1.01604629, + "epoch": 0.5223162754905732, + "flos": 1355174157312.0, + "grad_norm": 0.017358883808072843, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80001205, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.08105469, + "step": 2715, + "time_per_iteration": 5.007716417312622 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124661, + "balance_loss_mlp": 1.10728037, + "epoch": 0.5225086571758368, + "flos": 567747076608.0, + "grad_norm": 0.06973573346877397, + "language_loss": 0.86611319, + "learning_rate": 0.0004878510456629992, + "loss": 0.87735981, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.17407227, + "step": 2716, + "time_per_iteration": 3.006253957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131765, + "balance_loss_mlp": 1.11461031, + "epoch": 0.5227010388611004, + "flos": 500158001664.0, + "grad_norm": 0.07218030120275976, + "language_loss": 0.85169446, + "learning_rate": 0.00048753959771057314, + "loss": 0.86301208, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.17175293, + "step": 2717, + "time_per_iteration": 2.6976563930511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121586, + "balance_loss_mlp": 1.10383558, + "epoch": 0.522893420546364, + "flos": 597656558592.0, + "grad_norm": 0.07681806180198643, + "language_loss": 0.82615161, + "learning_rate": 0.0004872281545957044, + "loss": 0.83736753, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.1776123, + "step": 2718, + "time_per_iteration": 2.8015332221984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117931, + "balance_loss_mlp": 1.10027635, + "epoch": 0.5230858022316276, + "flos": 664605803520.0, + "grad_norm": 0.058351443586734386, + "language_loss": 0.85597366, + "learning_rate": 0.0004869167164393055, + "loss": 0.86715293, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.17675781, + "step": 2719, + "time_per_iteration": 2.9708495140075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116486, + "balance_loss_mlp": 1.09911728, + "epoch": 0.5232781839168911, + "flos": 603843434496.0, + "grad_norm": 0.06620613765458017, + "language_loss": 0.88742125, + "learning_rate": 0.00048660528336228793, + "loss": 
0.89858615, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.17382812, + "step": 2720, + "time_per_iteration": 2.7995879650115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106216, + "balance_loss_mlp": 1.08846569, + "epoch": 0.5234705656021547, + "flos": 550718300160.0, + "grad_norm": 0.06179859794056996, + "language_loss": 0.90307331, + "learning_rate": 0.0004862938554855606, + "loss": 0.91413546, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.1776123, + "step": 2721, + "time_per_iteration": 2.8321540355682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104126, + "balance_loss_mlp": 1.08690071, + "epoch": 0.5236629472874182, + "flos": 504279281664.0, + "grad_norm": 0.07085532730134622, + "language_loss": 0.85930234, + "learning_rate": 0.0004859824329300304, + "loss": 0.87034363, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.17248535, + "step": 2722, + "time_per_iteration": 2.6302812099456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110407, + "balance_loss_mlp": 1.08649826, + "epoch": 0.5238553289726818, + "flos": 547654597632.0, + "grad_norm": 0.07263306317055565, + "language_loss": 0.83477378, + "learning_rate": 0.00048567101581660244, + "loss": 0.84581447, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.17590332, + "step": 2723, + "time_per_iteration": 2.68910813331604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109903, + "balance_loss_mlp": 1.08181643, + "epoch": 0.5240477106579453, + "flos": 531962380800.0, + "grad_norm": 0.11439626446879424, + "language_loss": 0.87057537, + "learning_rate": 0.00048535960426617956, + "loss": 0.88156569, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.17236328, + "step": 2724, + "time_per_iteration": 2.622817039489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.07238674, + "epoch": 0.5242400923432089, + 
"flos": 617939559936.0, + "grad_norm": 0.061793488209652164, + "language_loss": 0.8146565, + "learning_rate": 0.0004850481983996621, + "loss": 0.8255589, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.17871094, + "step": 2725, + "time_per_iteration": 2.7661449909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_mlp": 1.07968855, + "epoch": 0.5244324740284725, + "flos": 416686187520.0, + "grad_norm": 0.1002744758401102, + "language_loss": 0.87726384, + "learning_rate": 0.0004847367983379492, + "loss": 0.8882367, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.17602539, + "step": 2726, + "time_per_iteration": 2.501094341278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096174, + "balance_loss_mlp": 1.0795207, + "epoch": 0.5246248557137361, + "flos": 626436509184.0, + "grad_norm": 0.06877444759134967, + "language_loss": 0.78732175, + "learning_rate": 0.00048442540420193643, + "loss": 0.79828346, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.16662598, + "step": 2727, + "time_per_iteration": 2.9280529022216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091328, + "balance_loss_mlp": 1.07391191, + "epoch": 0.5248172373989997, + "flos": 1248463590912.0, + "grad_norm": 0.07855483173762376, + "language_loss": 0.79334521, + "learning_rate": 0.0004841140161125182, + "loss": 0.80425853, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.17431641, + "step": 2728, + "time_per_iteration": 3.626858711242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093412, + "balance_loss_mlp": 1.07654381, + "epoch": 0.5250096190842631, + "flos": 506868710400.0, + "grad_norm": 0.08285412332857332, + "language_loss": 0.8463819, + "learning_rate": 0.0004838026341905857, + "loss": 0.85731602, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.16870117, + "step": 2729, + "time_per_iteration": 2.7793312072753906 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088713, + "balance_loss_mlp": 1.07182097, + "epoch": 0.5252020007695267, + "flos": 611317684224.0, + "grad_norm": 0.07499858641848273, + "language_loss": 0.85196304, + "learning_rate": 0.00048349125855702844, + "loss": 0.86285013, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.16906738, + "step": 2730, + "time_per_iteration": 2.8079419136047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092888, + "balance_loss_mlp": 1.07605541, + "epoch": 0.5253943824547903, + "flos": 539233998336.0, + "grad_norm": 0.07740216541040414, + "language_loss": 0.81396556, + "learning_rate": 0.00048317988933273287, + "loss": 0.82489449, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.16845703, + "step": 2731, + "time_per_iteration": 2.772430419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084718, + "balance_loss_mlp": 1.06807661, + "epoch": 0.5255867641400539, + "flos": 698038580736.0, + "grad_norm": 0.18745226220584338, + "language_loss": 0.82080007, + "learning_rate": 0.00048286852663858367, + "loss": 0.83164728, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.16650391, + "step": 2732, + "time_per_iteration": 2.9268972873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087343, + "balance_loss_mlp": 1.07036781, + "epoch": 0.5257791458253175, + "flos": 667289207808.0, + "grad_norm": 0.08325512934533874, + "language_loss": 0.8380754, + "learning_rate": 0.000482557170595462, + "loss": 0.84894884, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.16992188, + "step": 2733, + "time_per_iteration": 2.8951096534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093841, + "balance_loss_mlp": 1.07677019, + "epoch": 0.525971527510581, + "flos": 483620751360.0, + "grad_norm": 0.08900957978988387, + "language_loss": 0.87469298, + "learning_rate": 0.0004822458213242475, + "loss": 
0.88563132, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.17089844, + "step": 2734, + "time_per_iteration": 2.5620529651641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110054, + "balance_loss_mlp": 1.09249437, + "epoch": 0.5261639091958445, + "flos": 829916264448.0, + "grad_norm": 0.0633406501514696, + "language_loss": 0.85937345, + "learning_rate": 0.00048193447894581627, + "loss": 0.87047398, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.17565918, + "step": 2735, + "time_per_iteration": 3.103132486343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118378, + "balance_loss_mlp": 1.10083008, + "epoch": 0.5263562908811081, + "flos": 520715215872.0, + "grad_norm": 0.0756952830822362, + "language_loss": 0.87890029, + "learning_rate": 0.00048162314358104243, + "loss": 0.89008415, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.17565918, + "step": 2736, + "time_per_iteration": 2.6416001319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117726, + "balance_loss_mlp": 1.10027409, + "epoch": 0.5265486725663717, + "flos": 574996672512.0, + "grad_norm": 0.09251963370546762, + "language_loss": 0.83179659, + "learning_rate": 0.0004813118153507969, + "loss": 0.84297383, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.17468262, + "step": 2737, + "time_per_iteration": 2.7370142936706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078212, + "balance_loss_mlp": 1.0679127, + "epoch": 0.5267410542516352, + "flos": 1547261015040.0, + "grad_norm": 0.03576440897911325, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83525336, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.10302734, + "step": 2738, + "time_per_iteration": 4.797177076339722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110775, + "balance_loss_mlp": 1.08933258, + "epoch": 0.5269334359368988, + 
"flos": 929952493056.0, + "grad_norm": 0.07588810399495584, + "language_loss": 0.83266842, + "learning_rate": 0.00048068918077736163, + "loss": 0.84374589, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.18408203, + "step": 2739, + "time_per_iteration": 3.2253060340881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109875, + "balance_loss_mlp": 1.0805707, + "epoch": 0.5271258176221624, + "flos": 655389729792.0, + "grad_norm": 0.07650809384335877, + "language_loss": 0.81149924, + "learning_rate": 0.0004803778746759001, + "loss": 0.82248676, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.18188477, + "step": 2740, + "time_per_iteration": 2.917982578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091336, + "balance_loss_mlp": 1.07380056, + "epoch": 0.527318199307426, + "flos": 543036648960.0, + "grad_norm": 0.08493152657291815, + "language_loss": 0.81563872, + "learning_rate": 0.00048006657619242317, + "loss": 0.82655203, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.17553711, + "step": 2741, + "time_per_iteration": 2.6491029262542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083991, + "balance_loss_mlp": 1.0661335, + "epoch": 0.5275105809926895, + "flos": 447882670080.0, + "grad_norm": 0.09642753382189671, + "language_loss": 0.78573406, + "learning_rate": 0.00047975528544778775, + "loss": 0.79657394, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.17858887, + "step": 2742, + "time_per_iteration": 2.6600565910339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080617, + "balance_loss_mlp": 1.06256926, + "epoch": 0.527702962677953, + "flos": 578935143936.0, + "grad_norm": 0.07268225763303592, + "language_loss": 0.88256997, + "learning_rate": 0.00047944400256284754, + "loss": 0.89337611, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.18041992, + "step": 2743, + "time_per_iteration": 
2.7662084102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108031, + "balance_loss_mlp": 1.06228542, + "epoch": 0.5278953443632166, + "flos": 652773136896.0, + "grad_norm": 0.07011617815169531, + "language_loss": 0.79666251, + "learning_rate": 0.0004791327276584532, + "loss": 0.80746561, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.18041992, + "step": 2744, + "time_per_iteration": 2.835545301437378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075943, + "balance_loss_mlp": 1.05737054, + "epoch": 0.5280877260484802, + "flos": 514001935872.0, + "grad_norm": 0.08121623581547996, + "language_loss": 0.80470204, + "learning_rate": 0.00047882146085545264, + "loss": 0.81546152, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.18566895, + "step": 2745, + "time_per_iteration": 2.690206289291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_mlp": 1.02781987, + "epoch": 0.5282801077337438, + "flos": 1445460567552.0, + "grad_norm": 0.02647915133994321, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76439977, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.09765625, + "step": 2746, + "time_per_iteration": 5.020122766494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074184, + "balance_loss_mlp": 1.05564749, + "epoch": 0.5284724894190073, + "flos": 604856595456.0, + "grad_norm": 0.0832805570330896, + "language_loss": 0.79321563, + "learning_rate": 0.00047819895203700684, + "loss": 0.80395758, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.18530273, + "step": 2747, + "time_per_iteration": 2.770418167114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_mlp": 1.02084875, + "epoch": 0.5286648711042709, + "flos": 1494956321280.0, + "grad_norm": 0.025219008400043496, + "language_loss": 0.75512433, + "learning_rate": 
0.0004778877102632412, + "loss": 0.76542532, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.09228516, + "step": 2748, + "time_per_iteration": 4.670547246932983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_mlp": 1.04841685, + "epoch": 0.5288572527895344, + "flos": 597616911360.0, + "grad_norm": 0.08023961077007181, + "language_loss": 0.88480437, + "learning_rate": 0.0004775764770742277, + "loss": 0.89546895, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.18041992, + "step": 2749, + "time_per_iteration": 2.8597028255462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074765, + "balance_loss_mlp": 1.05651426, + "epoch": 0.529049634474798, + "flos": 557320352256.0, + "grad_norm": 0.0872100074417497, + "language_loss": 0.86519742, + "learning_rate": 0.00047726525259079777, + "loss": 0.87594503, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.18237305, + "step": 2750, + "time_per_iteration": 2.7900798320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080608, + "balance_loss_mlp": 1.06233358, + "epoch": 0.5292420161600616, + "flos": 581274952704.0, + "grad_norm": 0.10808949355702925, + "language_loss": 0.88474864, + "learning_rate": 0.0004769540369337798, + "loss": 0.89555472, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.18261719, + "step": 2751, + "time_per_iteration": 2.7448270320892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083505, + "balance_loss_mlp": 1.0650394, + "epoch": 0.5294343978453251, + "flos": 608303167488.0, + "grad_norm": 0.06879132043127602, + "language_loss": 0.85886008, + "learning_rate": 0.00047664283022399794, + "loss": 0.86969519, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.18469238, + "step": 2752, + "time_per_iteration": 2.8719866275787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080341, + "balance_loss_mlp": 1.06261468, 
+ "epoch": 0.5296267795305887, + "flos": 646522020864.0, + "grad_norm": 0.0740043611556158, + "language_loss": 0.81022358, + "learning_rate": 0.00047633163258227376, + "loss": 0.82102704, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.17736816, + "step": 2753, + "time_per_iteration": 2.904007911682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_mlp": 1.06734776, + "epoch": 0.5298191612158523, + "flos": 559746796032.0, + "grad_norm": 0.07290364739094941, + "language_loss": 0.85516405, + "learning_rate": 0.0004760204441294247, + "loss": 0.86601269, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.17529297, + "step": 2754, + "time_per_iteration": 2.728672504425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095448, + "balance_loss_mlp": 1.07741165, + "epoch": 0.5300115429011159, + "flos": 514046352384.0, + "grad_norm": 0.0727695026629463, + "language_loss": 0.86100507, + "learning_rate": 0.00047570926498626486, + "loss": 0.87195957, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.18066406, + "step": 2755, + "time_per_iteration": 2.726902484893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099745, + "balance_loss_mlp": 1.08242369, + "epoch": 0.5302039245863793, + "flos": 672789265920.0, + "grad_norm": 0.05921570741986168, + "language_loss": 0.81395233, + "learning_rate": 0.00047539809527360474, + "loss": 0.82494974, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.17333984, + "step": 2756, + "time_per_iteration": 2.87945556640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115628, + "balance_loss_mlp": 1.09774637, + "epoch": 0.5303963062716429, + "flos": 730836297216.0, + "grad_norm": 0.05551434768366506, + "language_loss": 0.82287431, + "learning_rate": 0.0004750869351122511, + "loss": 0.83403063, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.17883301, + "step": 2757, + 
"time_per_iteration": 3.0493249893188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112447, + "balance_loss_mlp": 1.10749459, + "epoch": 0.5305886879569065, + "flos": 573435085824.0, + "grad_norm": 0.0694425557197165, + "language_loss": 0.82020032, + "learning_rate": 0.00047477578462300685, + "loss": 0.83144498, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.16992188, + "step": 2758, + "time_per_iteration": 2.7602713108062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123309, + "balance_loss_mlp": 1.10578477, + "epoch": 0.5307810696421701, + "flos": 695335352832.0, + "grad_norm": 0.07804964416900076, + "language_loss": 0.79339695, + "learning_rate": 0.0004744646439266718, + "loss": 0.80463004, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.17541504, + "step": 2759, + "time_per_iteration": 3.010812997817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119146, + "balance_loss_mlp": 1.10195613, + "epoch": 0.5309734513274337, + "flos": 648943322112.0, + "grad_norm": 0.056360612774155563, + "language_loss": 0.92028886, + "learning_rate": 0.000474153513144041, + "loss": 0.93148029, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.17199707, + "step": 2760, + "time_per_iteration": 2.9704673290252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128006, + "balance_loss_mlp": 1.11117363, + "epoch": 0.5311658330126972, + "flos": 604824288768.0, + "grad_norm": 0.08001771173719906, + "language_loss": 0.86726296, + "learning_rate": 0.00047384239239590633, + "loss": 0.87854302, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.16845703, + "step": 2761, + "time_per_iteration": 2.891458749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129372, + "balance_loss_mlp": 1.11169338, + "epoch": 0.5313582146979607, + "flos": 558259361280.0, + "grad_norm": 0.06781273866770807, + "language_loss": 0.88723642, + 
"learning_rate": 0.0004735312818030556, + "loss": 0.89853013, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.17700195, + "step": 2762, + "time_per_iteration": 2.7164249420166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127323, + "balance_loss_mlp": 1.11076498, + "epoch": 0.5315505963832243, + "flos": 508410473472.0, + "grad_norm": 0.06505824064287292, + "language_loss": 0.82414401, + "learning_rate": 0.0004732201814862727, + "loss": 0.83541727, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.16564941, + "step": 2763, + "time_per_iteration": 2.726468563079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123829, + "balance_loss_mlp": 1.10723543, + "epoch": 0.5317429780684879, + "flos": 626439080448.0, + "grad_norm": 0.06470267434285343, + "language_loss": 0.81489587, + "learning_rate": 0.0004729090915663373, + "loss": 0.82613409, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.16601562, + "step": 2764, + "time_per_iteration": 2.8475723266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123779, + "balance_loss_mlp": 1.10759008, + "epoch": 0.5319353597537514, + "flos": 476744486400.0, + "grad_norm": 0.11068637871952317, + "language_loss": 0.85001844, + "learning_rate": 0.00047259801216402534, + "loss": 0.86125624, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.16186523, + "step": 2765, + "time_per_iteration": 2.540780544281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116718, + "balance_loss_mlp": 1.10029066, + "epoch": 0.532127741439015, + "flos": 501635524608.0, + "grad_norm": 0.07674788190906832, + "language_loss": 0.86407942, + "learning_rate": 0.00047228694340010845, + "loss": 0.87524652, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.16430664, + "step": 2766, + "time_per_iteration": 2.590508460998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121045, + 
"balance_loss_mlp": 1.1044749, + "epoch": 0.5323201231242786, + "flos": 1164586512384.0, + "grad_norm": 0.07081285799421494, + "language_loss": 0.85664678, + "learning_rate": 0.0004719758853953544, + "loss": 0.86785722, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.16577148, + "step": 2767, + "time_per_iteration": 3.6536149978637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118047, + "balance_loss_mlp": 1.10160804, + "epoch": 0.5325125048095422, + "flos": 378702273024.0, + "grad_norm": 0.1001432749586202, + "language_loss": 0.83710611, + "learning_rate": 0.00047166483827052645, + "loss": 0.84828657, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.16442871, + "step": 2768, + "time_per_iteration": 2.437939167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234354, + "balance_loss_mlp": 1.22538948, + "epoch": 0.5327048864948057, + "flos": 1541353121280.0, + "grad_norm": 0.06972612650118978, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78312844, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.08984375, + "step": 2769, + "time_per_iteration": 5.026838779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115645, + "balance_loss_mlp": 1.09895587, + "epoch": 0.5328972681800692, + "flos": 911272923648.0, + "grad_norm": 0.0780544569178282, + "language_loss": 0.83743083, + "learning_rate": 0.000471042777143682, + "loss": 0.84858727, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.16699219, + "step": 2770, + "time_per_iteration": 3.230933427810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113101, + "balance_loss_mlp": 1.09710324, + "epoch": 0.5330896498653328, + "flos": 473898097152.0, + "grad_norm": 0.20675341395216595, + "language_loss": 0.79602915, + "learning_rate": 0.0004707317633831707, + "loss": 0.80716014, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 
0.15991211, + "step": 2771, + "time_per_iteration": 2.6368706226348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106292, + "balance_loss_mlp": 1.09012711, + "epoch": 0.5332820315505964, + "flos": 501635524608.0, + "grad_norm": 0.0712649510509903, + "language_loss": 0.77926189, + "learning_rate": 0.00047042076098559673, + "loss": 0.79032481, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.16162109, + "step": 2772, + "time_per_iteration": 2.633755683898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105276, + "balance_loss_mlp": 1.08895612, + "epoch": 0.53347441323586, + "flos": 924439951872.0, + "grad_norm": 0.08177633680773212, + "language_loss": 0.74153018, + "learning_rate": 0.00047010977007170174, + "loss": 0.75258291, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.16320801, + "step": 2773, + "time_per_iteration": 3.257364273071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105171, + "balance_loss_mlp": 1.08880353, + "epoch": 0.5336667949211235, + "flos": 574455587328.0, + "grad_norm": 0.08878543355304569, + "language_loss": 0.8234973, + "learning_rate": 0.00046979879076222334, + "loss": 0.83454895, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.16369629, + "step": 2774, + "time_per_iteration": 2.6948111057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115652, + "balance_loss_mlp": 1.09958255, + "epoch": 0.533859176606387, + "flos": 1064664082944.0, + "grad_norm": 0.07031279684874672, + "language_loss": 0.84660083, + "learning_rate": 0.0004694878231778939, + "loss": 0.85775733, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.16064453, + "step": 2775, + "time_per_iteration": 3.391101121902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111582, + "balance_loss_mlp": 1.09510732, + "epoch": 0.5340515582916506, + "flos": 746602665984.0, + "grad_norm": 0.06461927889010362, + 
"language_loss": 0.84379047, + "learning_rate": 0.0004691768674394423, + "loss": 0.85490632, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.16479492, + "step": 2776, + "time_per_iteration": 2.9977481365203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_mlp": 1.03071785, + "epoch": 0.5342439399769142, + "flos": 1445685594624.0, + "grad_norm": 0.02105469632037268, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85523784, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.09082031, + "step": 2777, + "time_per_iteration": 4.769741535186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_mlp": 1.02591205, + "epoch": 0.5344363216621778, + "flos": 1427569505280.0, + "grad_norm": 0.019005935883373085, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77688694, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.09228516, + "step": 2778, + "time_per_iteration": 4.987689733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118552, + "balance_loss_mlp": 1.10211313, + "epoch": 0.5346287033474413, + "flos": 527618644992.0, + "grad_norm": 0.06371644955079436, + "language_loss": 0.79125863, + "learning_rate": 0.00046824407250656676, + "loss": 0.80244416, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.16442871, + "step": 2779, + "time_per_iteration": 2.6410112380981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112995, + "balance_loss_mlp": 1.09662735, + "epoch": 0.5348210850327049, + "flos": 510762765312.0, + "grad_norm": 0.060742687445953125, + "language_loss": 0.83655095, + "learning_rate": 0.0004679331653588161, + "loss": 0.84768081, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.16369629, + "step": 2780, + "time_per_iteration": 2.625710964202881 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01112315, + "balance_loss_mlp": 1.09542346, + "epoch": 0.5350134667179685, + "flos": 462668184576.0, + "grad_norm": 0.07272998333963254, + "language_loss": 0.85177255, + "learning_rate": 0.0004676222706605147, + "loss": 0.86289573, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.16906738, + "step": 2781, + "time_per_iteration": 2.673433542251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110827, + "balance_loss_mlp": 1.09407806, + "epoch": 0.535205848403232, + "flos": 708875712000.0, + "grad_norm": 0.07193058078875894, + "language_loss": 0.85307002, + "learning_rate": 0.0004673113885323626, + "loss": 0.8641783, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.16748047, + "step": 2782, + "time_per_iteration": 2.8941664695739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106993, + "balance_loss_mlp": 1.09025598, + "epoch": 0.5353982300884956, + "flos": 894241575936.0, + "grad_norm": 0.10372367104553785, + "language_loss": 0.78561115, + "learning_rate": 0.00046700051909505494, + "loss": 0.79668105, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.16748047, + "step": 2783, + "time_per_iteration": 3.2081563472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111085, + "balance_loss_mlp": 1.09330261, + "epoch": 0.5355906117737591, + "flos": 535965092352.0, + "grad_norm": 0.06865237294530599, + "language_loss": 0.83605123, + "learning_rate": 0.000466689662469282, + "loss": 0.84715974, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.17553711, + "step": 2784, + "time_per_iteration": 2.6711413860321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104532, + "balance_loss_mlp": 1.08773518, + "epoch": 0.5357829934590227, + "flos": 868846528512.0, + "grad_norm": 0.08186219318834767, + "language_loss": 0.83921355, + "learning_rate": 0.00046637881877572917, + "loss": 0.85025889, + "num_input_tokens_seen": 
232337232, + "router_z_loss_mlp": 0.16809082, + "step": 2785, + "time_per_iteration": 3.1084179878234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094145, + "balance_loss_mlp": 1.07644248, + "epoch": 0.5359753751442863, + "flos": 553287905280.0, + "grad_norm": 0.07421115565240126, + "language_loss": 0.84573698, + "learning_rate": 0.0004660679881350764, + "loss": 0.85667843, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.17736816, + "step": 2786, + "time_per_iteration": 2.7627315521240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_mlp": 1.02681208, + "epoch": 0.5361677568295499, + "flos": 1480499347968.0, + "grad_norm": 0.02311153951998418, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76644635, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.09667969, + "step": 2787, + "time_per_iteration": 5.0513763427734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086082, + "balance_loss_mlp": 1.06855869, + "epoch": 0.5363601385148133, + "flos": 806255700480.0, + "grad_norm": 0.07609779475010685, + "language_loss": 0.77801538, + "learning_rate": 0.0004654463664951667, + "loss": 0.78887624, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.17541504, + "step": 2788, + "time_per_iteration": 3.050717353820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085404, + "balance_loss_mlp": 1.06829762, + "epoch": 0.5365525202000769, + "flos": 507879300096.0, + "grad_norm": 0.06896319927596091, + "language_loss": 0.82818955, + "learning_rate": 0.0004651355757372447, + "loss": 0.83904356, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.17126465, + "step": 2789, + "time_per_iteration": 2.621809244155884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108222, + "balance_loss_mlp": 1.064816, + "epoch": 0.5367449018853405, + "flos": 528930611712.0, + "grad_norm": 
0.06368186458097214, + "language_loss": 0.85671151, + "learning_rate": 0.00046482479851489274, + "loss": 0.86753374, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.17431641, + "step": 2790, + "time_per_iteration": 2.6873245239257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107657, + "balance_loss_mlp": 1.05957103, + "epoch": 0.5369372835706041, + "flos": 649934088192.0, + "grad_norm": 0.09368235748008798, + "language_loss": 0.77583152, + "learning_rate": 0.00046451403494876525, + "loss": 0.78659725, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.17016602, + "step": 2791, + "time_per_iteration": 2.9352025985717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073051, + "balance_loss_mlp": 1.05570602, + "epoch": 0.5371296652558677, + "flos": 584489530368.0, + "grad_norm": 0.09106511666805264, + "language_loss": 0.84479213, + "learning_rate": 0.0004642032851595111, + "loss": 0.85552263, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.17358398, + "step": 2792, + "time_per_iteration": 2.8460757732391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107606, + "balance_loss_mlp": 1.05853653, + "epoch": 0.5373220469411312, + "flos": 595872516096.0, + "grad_norm": 0.09557816920928826, + "language_loss": 0.84886861, + "learning_rate": 0.00046389254926777404, + "loss": 0.85962915, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.17541504, + "step": 2793, + "time_per_iteration": 2.8258917331695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071465, + "balance_loss_mlp": 1.05381024, + "epoch": 0.5375144286263948, + "flos": 1114426335744.0, + "grad_norm": 0.10419489870866282, + "language_loss": 0.78006279, + "learning_rate": 0.0004635818273941926, + "loss": 0.79077744, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.17675781, + "step": 2794, + "time_per_iteration": 3.5380136966705322 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01073554, + "balance_loss_mlp": 1.05581546, + "epoch": 0.5377068103116583, + "flos": 595608215040.0, + "grad_norm": 0.09943669711596623, + "language_loss": 0.81746304, + "learning_rate": 0.0004632711196593997, + "loss": 0.82819855, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.1776123, + "step": 2795, + "time_per_iteration": 2.780565023422241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076704, + "balance_loss_mlp": 1.05881083, + "epoch": 0.5378991919969219, + "flos": 884200292352.0, + "grad_norm": 0.08810005094672828, + "language_loss": 0.85034251, + "learning_rate": 0.00046296042618402297, + "loss": 0.86110961, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.17907715, + "step": 2796, + "time_per_iteration": 3.099726915359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076408, + "balance_loss_mlp": 1.0591228, + "epoch": 0.5380915736821854, + "flos": 710664523776.0, + "grad_norm": 0.06043623665913195, + "language_loss": 0.79098737, + "learning_rate": 0.0004626497470886839, + "loss": 0.80175149, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.17297363, + "step": 2797, + "time_per_iteration": 2.975820541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082894, + "balance_loss_mlp": 1.06584692, + "epoch": 0.538283955367449, + "flos": 556999151616.0, + "grad_norm": 0.06634785168506467, + "language_loss": 0.81794053, + "learning_rate": 0.00046233908249399897, + "loss": 0.82876945, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.1706543, + "step": 2798, + "time_per_iteration": 2.7805473804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086325, + "balance_loss_mlp": 1.06942129, + "epoch": 0.5384763370527126, + "flos": 513470762496.0, + "grad_norm": 0.07252012949911142, + "language_loss": 0.78733051, + "learning_rate": 0.00046202843252057905, + "loss": 0.79819375, + 
"num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.16906738, + "step": 2799, + "time_per_iteration": 2.666600227355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091679, + "balance_loss_mlp": 1.07437015, + "epoch": 0.5386687187379762, + "flos": 489736046592.0, + "grad_norm": 0.07864108960704319, + "language_loss": 0.83561981, + "learning_rate": 0.00046171779728902896, + "loss": 0.84653658, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.17333984, + "step": 2800, + "time_per_iteration": 2.6010262966156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094958, + "balance_loss_mlp": 1.07766032, + "epoch": 0.5388611004232398, + "flos": 482657149440.0, + "grad_norm": 0.11618067186279732, + "language_loss": 0.85997868, + "learning_rate": 0.000461407176919948, + "loss": 0.87092829, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.17321777, + "step": 2801, + "time_per_iteration": 2.5429272651672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094632, + "balance_loss_mlp": 1.07774007, + "epoch": 0.5390534821085032, + "flos": 560984610816.0, + "grad_norm": 0.08430832790687283, + "language_loss": 0.84795403, + "learning_rate": 0.00046109657153392997, + "loss": 0.85890037, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.16906738, + "step": 2802, + "time_per_iteration": 2.6846201419830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108893, + "balance_loss_mlp": 1.07168102, + "epoch": 0.5392458637937668, + "flos": 488377092096.0, + "grad_norm": 0.08650976784842915, + "language_loss": 0.82548422, + "learning_rate": 0.0004607859812515622, + "loss": 0.83637351, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.17272949, + "step": 2803, + "time_per_iteration": 2.5817925930023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107998, + "balance_loss_mlp": 1.06338573, + "epoch": 0.5394382454790304, + "flos": 
512057479680.0, + "grad_norm": 0.07563802138366026, + "language_loss": 0.87865353, + "learning_rate": 0.00046047540619342667, + "loss": 0.88945341, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.16601562, + "step": 2804, + "time_per_iteration": 2.6165053844451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083951, + "balance_loss_mlp": 1.06755948, + "epoch": 0.539630627164294, + "flos": 567586662912.0, + "grad_norm": 0.07064105870143675, + "language_loss": 0.79886174, + "learning_rate": 0.00046016484648009933, + "loss": 0.8097012, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.16394043, + "step": 2805, + "time_per_iteration": 2.725764274597168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084411, + "balance_loss_mlp": 1.06835365, + "epoch": 0.5398230088495575, + "flos": 526462322688.0, + "grad_norm": 0.07630556738551086, + "language_loss": 0.80977762, + "learning_rate": 0.0004598543022321501, + "loss": 0.82062167, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.16052246, + "step": 2806, + "time_per_iteration": 2.6351540088653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085497, + "balance_loss_mlp": 1.06909394, + "epoch": 0.5400153905348211, + "flos": 538764493824.0, + "grad_norm": 0.0649087683342786, + "language_loss": 0.79606426, + "learning_rate": 0.0004595437735701433, + "loss": 0.80691922, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.1640625, + "step": 2807, + "time_per_iteration": 2.706876516342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085225, + "balance_loss_mlp": 1.06884575, + "epoch": 0.5402077722200846, + "flos": 513539771904.0, + "grad_norm": 0.08230029830948764, + "language_loss": 0.83224154, + "learning_rate": 0.00045923326061463623, + "loss": 0.84309381, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.16381836, + "step": 2808, + "time_per_iteration": 2.7869887351989746 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091151, + "balance_loss_mlp": 1.07481909, + "epoch": 0.5404001539053482, + "flos": 676258232832.0, + "grad_norm": 0.06556687541720137, + "language_loss": 0.81677991, + "learning_rate": 0.00045892276348618113, + "loss": 0.82769144, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.16333008, + "step": 2809, + "time_per_iteration": 3.031975269317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_mlp": 1.03327227, + "epoch": 0.5405925355906118, + "flos": 1554834009600.0, + "grad_norm": 0.026553937309941048, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79302251, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.08154297, + "step": 2810, + "time_per_iteration": 5.018324613571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097064, + "balance_loss_mlp": 1.08080387, + "epoch": 0.5407849172758753, + "flos": 647310154752.0, + "grad_norm": 0.07012301152495938, + "language_loss": 0.80724698, + "learning_rate": 0.000458301817192603, + "loss": 0.81821764, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.16259766, + "step": 2811, + "time_per_iteration": 2.8826699256896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_mlp": 1.02369976, + "epoch": 0.5409772989611389, + "flos": 1407407643648.0, + "grad_norm": 0.020407688998465158, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81873488, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.08007812, + "step": 2812, + "time_per_iteration": 4.821629762649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094803, + "balance_loss_mlp": 1.07879376, + "epoch": 0.5411696806464025, + "flos": 554389899264.0, + "grad_norm": 0.09349970811932752, + "language_loss": 0.87107521, + "learning_rate": 0.00045768093565369983, + "loss": 
0.88202327, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.16003418, + "step": 2813, + "time_per_iteration": 2.798082113265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096657, + "balance_loss_mlp": 1.08068299, + "epoch": 0.5413620623316661, + "flos": 528122654208.0, + "grad_norm": 0.08975534837118274, + "language_loss": 0.8179177, + "learning_rate": 0.0004573705194685646, + "loss": 0.82888424, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.15966797, + "step": 2814, + "time_per_iteration": 2.7093122005462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_mlp": 1.07979465, + "epoch": 0.5415544440169295, + "flos": 598741300224.0, + "grad_norm": 0.07912714625539458, + "language_loss": 0.85284495, + "learning_rate": 0.00045706011983366157, + "loss": 0.86380327, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.16027832, + "step": 2815, + "time_per_iteration": 2.736809253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098087, + "balance_loss_mlp": 1.08264983, + "epoch": 0.5417468257021931, + "flos": 470757671424.0, + "grad_norm": 0.08398974332430421, + "language_loss": 0.82530612, + "learning_rate": 0.00045674973686949847, + "loss": 0.83628702, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.1541748, + "step": 2816, + "time_per_iteration": 2.531439781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105366, + "balance_loss_mlp": 1.08896279, + "epoch": 0.5419392073874567, + "flos": 680819281920.0, + "grad_norm": 0.06449066246678943, + "language_loss": 0.85269451, + "learning_rate": 0.0004564393706965766, + "loss": 0.86374819, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.1640625, + "step": 2817, + "time_per_iteration": 3.0000851154327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112663, + "balance_loss_mlp": 1.0963788, + "epoch": 0.5421315890727203, + 
"flos": 462374148096.0, + "grad_norm": 0.0725055130640743, + "language_loss": 0.81484962, + "learning_rate": 0.00045612902143539116, + "loss": 0.82597625, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.1628418, + "step": 2818, + "time_per_iteration": 2.5587399005889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117291, + "balance_loss_mlp": 1.10132849, + "epoch": 0.5423239707579839, + "flos": 436959277056.0, + "grad_norm": 0.0784970788328837, + "language_loss": 0.81825465, + "learning_rate": 0.00045581868920642986, + "loss": 0.82942754, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.1595459, + "step": 2819, + "time_per_iteration": 2.4901785850524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126012, + "balance_loss_mlp": 1.11031175, + "epoch": 0.5425163524432474, + "flos": 458314536960.0, + "grad_norm": 0.09999971886905719, + "language_loss": 0.79204059, + "learning_rate": 0.00045550837413017457, + "loss": 0.80330074, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.15686035, + "step": 2820, + "time_per_iteration": 2.616154909133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113009, + "balance_loss_mlp": 1.11416399, + "epoch": 0.542708734128511, + "flos": 419495500800.0, + "grad_norm": 0.06819679789144961, + "language_loss": 0.85130954, + "learning_rate": 0.0004551980763271005, + "loss": 0.86261046, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.15917969, + "step": 2821, + "time_per_iteration": 2.655139923095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125407, + "balance_loss_mlp": 1.10927796, + "epoch": 0.5429011158137745, + "flos": 678454880256.0, + "grad_norm": 0.0864844698510893, + "language_loss": 0.83889675, + "learning_rate": 0.0004548877959176756, + "loss": 0.85015082, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.16125488, + "step": 2822, + "time_per_iteration": 2.8853647708892822 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118965, + "balance_loss_mlp": 1.10281217, + "epoch": 0.5430934974990381, + "flos": 540924065280.0, + "grad_norm": 0.08050409404863457, + "language_loss": 0.8577252, + "learning_rate": 0.00045457753302236166, + "loss": 0.86891484, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.16149902, + "step": 2823, + "time_per_iteration": 2.6340198516845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098758, + "balance_loss_mlp": 1.08265328, + "epoch": 0.5432858791843016, + "flos": 658468486656.0, + "grad_norm": 0.09623202069762404, + "language_loss": 0.86938739, + "learning_rate": 0.00045426728776161353, + "loss": 0.88037497, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.16101074, + "step": 2824, + "time_per_iteration": 2.792646646499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093887, + "balance_loss_mlp": 1.07741261, + "epoch": 0.5434782608695652, + "flos": 531935216640.0, + "grad_norm": 0.09943652396187513, + "language_loss": 0.81526875, + "learning_rate": 0.00045395706025587863, + "loss": 0.82620764, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.16479492, + "step": 2825, + "time_per_iteration": 2.6433639526367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086855, + "balance_loss_mlp": 1.07033277, + "epoch": 0.5436706425548288, + "flos": 608501030400.0, + "grad_norm": 0.0973793187026711, + "language_loss": 0.82506776, + "learning_rate": 0.00045364685062559843, + "loss": 0.83593631, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.1652832, + "step": 2826, + "time_per_iteration": 2.8686280250549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082796, + "balance_loss_mlp": 1.06635737, + "epoch": 0.5438630242400924, + "flos": 705418854912.0, + "grad_norm": 0.08127433233154835, + "language_loss": 0.91488934, + "learning_rate": 0.0004533366589912067, + "loss": 
0.92571723, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.16442871, + "step": 2827, + "time_per_iteration": 2.9782917499542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080524, + "balance_loss_mlp": 1.06361961, + "epoch": 0.544055405925356, + "flos": 856425788928.0, + "grad_norm": 0.0854569540023736, + "language_loss": 0.77591085, + "learning_rate": 0.0004530264854731306, + "loss": 0.7867161, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.16918945, + "step": 2828, + "time_per_iteration": 3.036414623260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088214, + "balance_loss_mlp": 1.07106018, + "epoch": 0.5442477876106194, + "flos": 571779523584.0, + "grad_norm": 0.06060788976216288, + "language_loss": 0.83699155, + "learning_rate": 0.00045271633019179034, + "loss": 0.84787375, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.17163086, + "step": 2829, + "time_per_iteration": 2.7964255809783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085625, + "balance_loss_mlp": 1.06869721, + "epoch": 0.544440169295883, + "flos": 625556971008.0, + "grad_norm": 0.07110421348748326, + "language_loss": 0.87746441, + "learning_rate": 0.0004524061932675986, + "loss": 0.88832062, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.16943359, + "step": 2830, + "time_per_iteration": 2.8379290103912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108832, + "balance_loss_mlp": 1.07154715, + "epoch": 0.5446325509811466, + "flos": 836244103680.0, + "grad_norm": 0.09242408982484117, + "language_loss": 0.86632991, + "learning_rate": 0.00045209607482096125, + "loss": 0.87721312, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.16784668, + "step": 2831, + "time_per_iteration": 3.018829345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082759, + "balance_loss_mlp": 1.06516385, + "epoch": 0.5448249326664102, + 
"flos": 483381043200.0, + "grad_norm": 0.07061707018893328, + "language_loss": 0.84004849, + "learning_rate": 0.0004517859749722772, + "loss": 0.85087609, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.17614746, + "step": 2832, + "time_per_iteration": 2.6852874755859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080603, + "balance_loss_mlp": 1.06297243, + "epoch": 0.5450173143516738, + "flos": 561107948544.0, + "grad_norm": 0.0761986265844091, + "language_loss": 0.79247868, + "learning_rate": 0.0004514758938419376, + "loss": 0.8032847, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.17663574, + "step": 2833, + "time_per_iteration": 2.8408279418945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041827, + "balance_loss_mlp": 1.03262424, + "epoch": 0.5452096960369373, + "flos": 1470420988416.0, + "grad_norm": 0.03242070177943237, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77962416, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.09179688, + "step": 2834, + "time_per_iteration": 4.971372842788696 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079743, + "balance_loss_mlp": 1.06190884, + "epoch": 0.5454020777222008, + "flos": 465064892928.0, + "grad_norm": 0.12322372516304661, + "language_loss": 0.83831322, + "learning_rate": 0.00045085578821782175, + "loss": 0.84911072, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.1784668, + "step": 2835, + "time_per_iteration": 2.568789482116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021066, + "balance_loss_mlp": 1.01186323, + "epoch": 0.5455944594074644, + "flos": 1469657820672.0, + "grad_norm": 0.019977782676812977, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77155805, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.09179688, + "step": 2836, + "time_per_iteration": 
4.917972803115845 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.05981982, + "epoch": 0.545786841092728, + "flos": 533180371968.0, + "grad_norm": 0.07873848801353439, + "language_loss": 0.809609, + "learning_rate": 0.00045023575891159866, + "loss": 0.82039082, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.18347168, + "step": 2837, + "time_per_iteration": 2.723172187805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005469, + "balance_loss_mlp": 0.99645638, + "epoch": 0.5459792227779915, + "flos": 1352389810176.0, + "grad_norm": 0.008784480510471485, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75769281, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.09033203, + "step": 2838, + "time_per_iteration": 4.9626312255859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108476, + "balance_loss_mlp": 1.06662869, + "epoch": 0.5461716044632551, + "flos": 637881537024.0, + "grad_norm": 0.06459027340027895, + "language_loss": 0.77977401, + "learning_rate": 0.0004496158068861354, + "loss": 0.79062164, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.18139648, + "step": 2839, + "time_per_iteration": 2.8617422580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089506, + "balance_loss_mlp": 1.0716958, + "epoch": 0.5463639861485187, + "flos": 602751352320.0, + "grad_norm": 0.06807598587278012, + "language_loss": 0.8025732, + "learning_rate": 0.00044930586015455207, + "loss": 0.81346834, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.17810059, + "step": 2840, + "time_per_iteration": 2.808669328689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083083, + "balance_loss_mlp": 1.06519008, + "epoch": 0.5465563678337823, + "flos": 642516738048.0, + "grad_norm": 0.07651604144285383, + "language_loss": 0.88620353, + "learning_rate": 
0.000448995933104179, + "loss": 0.89703441, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.17907715, + "step": 2841, + "time_per_iteration": 2.877012252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091203, + "balance_loss_mlp": 1.07347631, + "epoch": 0.5467487495190458, + "flos": 614154161664.0, + "grad_norm": 0.06436857909350054, + "language_loss": 0.79967082, + "learning_rate": 0.00044868602585534077, + "loss": 0.81058288, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.17749023, + "step": 2842, + "time_per_iteration": 2.8602800369262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086176, + "balance_loss_mlp": 1.06872416, + "epoch": 0.5469411312043093, + "flos": 461190661632.0, + "grad_norm": 0.07724706520419639, + "language_loss": 0.88682342, + "learning_rate": 0.0004483761385283541, + "loss": 0.89768517, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.17468262, + "step": 2843, + "time_per_iteration": 2.613612413406372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083541, + "balance_loss_mlp": 1.06613624, + "epoch": 0.5471335128895729, + "flos": 561197154816.0, + "grad_norm": 0.07006219963607276, + "language_loss": 0.81547797, + "learning_rate": 0.0004480662712435281, + "loss": 0.82631338, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.17419434, + "step": 2844, + "time_per_iteration": 2.754683256149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084106, + "balance_loss_mlp": 1.0670594, + "epoch": 0.5473258945748365, + "flos": 518686695936.0, + "grad_norm": 0.0733295738661856, + "language_loss": 0.88330519, + "learning_rate": 0.0004477564241211635, + "loss": 0.89414632, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.1706543, + "step": 2845, + "time_per_iteration": 2.6289172172546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_mlp": 1.06219196, + 
"epoch": 0.5475182762601001, + "flos": 433828763136.0, + "grad_norm": 0.07864053458548881, + "language_loss": 0.8673318, + "learning_rate": 0.0004474465972815541, + "loss": 0.87812233, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.16870117, + "step": 2846, + "time_per_iteration": 2.560227870941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082496, + "balance_loss_mlp": 1.06498456, + "epoch": 0.5477106579453636, + "flos": 511560811008.0, + "grad_norm": 0.07175771823025028, + "language_loss": 0.87547499, + "learning_rate": 0.000447136790844985, + "loss": 0.88629997, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.17529297, + "step": 2847, + "time_per_iteration": 2.677354574203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084037, + "balance_loss_mlp": 1.0662632, + "epoch": 0.5479030396306271, + "flos": 675912439296.0, + "grad_norm": 0.07349860951266184, + "language_loss": 0.80877674, + "learning_rate": 0.00044682700493173385, + "loss": 0.81961715, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.17785645, + "step": 2848, + "time_per_iteration": 2.8295233249664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085345, + "balance_loss_mlp": 1.06835747, + "epoch": 0.5480954213158907, + "flos": 876090981888.0, + "grad_norm": 0.14023883156705388, + "language_loss": 0.80396128, + "learning_rate": 0.00044651723966207004, + "loss": 0.81481469, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.17004395, + "step": 2849, + "time_per_iteration": 3.1462562084198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108891, + "balance_loss_mlp": 1.07174444, + "epoch": 0.5482878030011543, + "flos": 622006511616.0, + "grad_norm": 0.07606363506125788, + "language_loss": 0.78336805, + "learning_rate": 0.00044620749515625536, + "loss": 0.79425722, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.17163086, + "step": 2850, + 
"time_per_iteration": 2.7834317684173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010911, + "balance_loss_mlp": 1.07376719, + "epoch": 0.5484801846864179, + "flos": 497207725056.0, + "grad_norm": 0.06852456667367239, + "language_loss": 0.84954178, + "learning_rate": 0.00044589777153454334, + "loss": 0.86045277, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.17346191, + "step": 2851, + "time_per_iteration": 2.760814666748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093157, + "balance_loss_mlp": 1.076015, + "epoch": 0.5486725663716814, + "flos": 442432171008.0, + "grad_norm": 0.07096393350950583, + "language_loss": 0.83673847, + "learning_rate": 0.00044558806891717895, + "loss": 0.84767002, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.17163086, + "step": 2852, + "time_per_iteration": 2.5164217948913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100792, + "balance_loss_mlp": 1.08369744, + "epoch": 0.548864948056945, + "flos": 655162504704.0, + "grad_norm": 0.07126320694951607, + "language_loss": 0.79487526, + "learning_rate": 0.0004452783874243998, + "loss": 0.80588323, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.17102051, + "step": 2853, + "time_per_iteration": 2.8530960083007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103909, + "balance_loss_mlp": 1.08725584, + "epoch": 0.5490573297422086, + "flos": 546036111360.0, + "grad_norm": 0.08398495342430926, + "language_loss": 0.84832799, + "learning_rate": 0.00044496872717643475, + "loss": 0.85936707, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.16662598, + "step": 2854, + "time_per_iteration": 2.7308356761932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148414, + "balance_loss_mlp": 1.13902032, + "epoch": 0.5492497114274721, + "flos": 1590309987840.0, + "grad_norm": 0.045162076754917825, + "language_loss": 0.77089292, + 
"learning_rate": 0.00044465908829350453, + "loss": 0.78237706, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.09375, + "step": 2855, + "time_per_iteration": 4.96257209777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110869, + "balance_loss_mlp": 1.0924654, + "epoch": 0.5494420931127356, + "flos": 750906754560.0, + "grad_norm": 0.08468526373475738, + "language_loss": 0.81551182, + "learning_rate": 0.0004443494708958217, + "loss": 0.8265987, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.16223145, + "step": 2856, + "time_per_iteration": 3.0704264640808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101313, + "balance_loss_mlp": 1.08494544, + "epoch": 0.5496344747979992, + "flos": 626023904256.0, + "grad_norm": 0.07044544020385766, + "language_loss": 0.8094157, + "learning_rate": 0.0004440398751035906, + "loss": 0.82042885, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.16369629, + "step": 2857, + "time_per_iteration": 2.971601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089993, + "balance_loss_mlp": 1.07342279, + "epoch": 0.5498268564832628, + "flos": 523111924224.0, + "grad_norm": 0.09537197244188163, + "language_loss": 0.83738565, + "learning_rate": 0.00044373030103700645, + "loss": 0.84828568, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.16577148, + "step": 2858, + "time_per_iteration": 2.6193714141845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082176, + "balance_loss_mlp": 1.06564164, + "epoch": 0.5500192381685264, + "flos": 604587151872.0, + "grad_norm": 0.080765091719421, + "language_loss": 0.79399335, + "learning_rate": 0.000443420748816257, + "loss": 0.80481505, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.16540527, + "step": 2859, + "time_per_iteration": 2.8064911365509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080106, + "balance_loss_mlp": 
1.06258249, + "epoch": 0.55021161985379, + "flos": 520527264768.0, + "grad_norm": 0.073148777328263, + "language_loss": 0.78411651, + "learning_rate": 0.0004431112185615208, + "loss": 0.79491758, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.17541504, + "step": 2860, + "time_per_iteration": 2.8055756092071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075442, + "balance_loss_mlp": 1.05794191, + "epoch": 0.5504040015390534, + "flos": 489671806464.0, + "grad_norm": 0.07383159181316334, + "language_loss": 0.80081785, + "learning_rate": 0.00044280171039296845, + "loss": 0.81157225, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.17504883, + "step": 2861, + "time_per_iteration": 2.643036127090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107612, + "balance_loss_mlp": 1.05894184, + "epoch": 0.550596383224317, + "flos": 575787377664.0, + "grad_norm": 0.07661018407476591, + "language_loss": 0.88472402, + "learning_rate": 0.0004424922244307616, + "loss": 0.89548522, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.171875, + "step": 2862, + "time_per_iteration": 2.735457181930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071011, + "balance_loss_mlp": 1.05303383, + "epoch": 0.5507887649095806, + "flos": 642445157376.0, + "grad_norm": 0.07542764443639904, + "language_loss": 0.82038581, + "learning_rate": 0.00044218276079505315, + "loss": 0.83109593, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.17980957, + "step": 2863, + "time_per_iteration": 2.8912277221679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074407, + "balance_loss_mlp": 1.05706251, + "epoch": 0.5509811465948442, + "flos": 531843812352.0, + "grad_norm": 0.07733612279333801, + "language_loss": 0.74451876, + "learning_rate": 0.0004418733196059876, + "loss": 0.75526285, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.17358398, + "step": 2864, + 
"time_per_iteration": 2.7250518798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072289, + "balance_loss_mlp": 1.0549556, + "epoch": 0.5511735282801077, + "flos": 654747328512.0, + "grad_norm": 0.07639087544106095, + "language_loss": 0.79757476, + "learning_rate": 0.0004415639009837008, + "loss": 0.80829769, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.17358398, + "step": 2865, + "time_per_iteration": 2.864443302154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080254, + "balance_loss_mlp": 1.06293249, + "epoch": 0.5513659099653713, + "flos": 529498861056.0, + "grad_norm": 0.10225669356006223, + "language_loss": 0.81241995, + "learning_rate": 0.00044125450504831955, + "loss": 0.82322252, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.17346191, + "step": 2866, + "time_per_iteration": 2.757418394088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106921, + "balance_loss_mlp": 1.05211556, + "epoch": 0.5515582916506349, + "flos": 554869315584.0, + "grad_norm": 0.07466053084799135, + "language_loss": 0.82329029, + "learning_rate": 0.0004409451319199622, + "loss": 0.83398235, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.17102051, + "step": 2867, + "time_per_iteration": 2.6991469860076904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076691, + "balance_loss_mlp": 1.05928612, + "epoch": 0.5517506733358984, + "flos": 735407258112.0, + "grad_norm": 0.07186936074556817, + "language_loss": 0.84288383, + "learning_rate": 0.0004406357817187381, + "loss": 0.85365069, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.17419434, + "step": 2868, + "time_per_iteration": 3.0115489959716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080563, + "balance_loss_mlp": 1.06333685, + "epoch": 0.551943055021162, + "flos": 1115325697536.0, + "grad_norm": 0.0781084398751081, + "language_loss": 0.81316972, + 
"learning_rate": 0.0004403264545647474, + "loss": 0.82397532, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.17224121, + "step": 2869, + "time_per_iteration": 3.5515377521514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076801, + "balance_loss_mlp": 1.05957544, + "epoch": 0.5521354367064255, + "flos": 544373208576.0, + "grad_norm": 0.2476039521732135, + "language_loss": 0.84535432, + "learning_rate": 0.00044001715057808154, + "loss": 0.85612237, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.17236328, + "step": 2870, + "time_per_iteration": 2.784949541091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081188, + "balance_loss_mlp": 1.06391478, + "epoch": 0.5523278183916891, + "flos": 936285101568.0, + "grad_norm": 0.06269874774360217, + "language_loss": 0.81665605, + "learning_rate": 0.0004397078698788232, + "loss": 0.82746798, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.17285156, + "step": 2871, + "time_per_iteration": 3.2355031967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_mlp": 1.02401352, + "epoch": 0.5525202000769527, + "flos": 1465911696384.0, + "grad_norm": 0.01828848292268018, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81475484, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.09130859, + "step": 2872, + "time_per_iteration": 4.935345888137817 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102138, + "balance_loss_mlp": 1.08499527, + "epoch": 0.5527125817622163, + "flos": 489800286720.0, + "grad_norm": 0.07166089349392388, + "language_loss": 0.77967119, + "learning_rate": 0.00043908937882281343, + "loss": 0.79069257, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.17150879, + "step": 2873, + "time_per_iteration": 2.6478757858276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109525, + 
"balance_loss_mlp": 1.0917629, + "epoch": 0.5529049634474797, + "flos": 634914008064.0, + "grad_norm": 0.0876696984943119, + "language_loss": 0.8235116, + "learning_rate": 0.0004387801687061814, + "loss": 0.83460689, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.17773438, + "step": 2874, + "time_per_iteration": 2.8796098232269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117288, + "balance_loss_mlp": 1.09996676, + "epoch": 0.5530973451327433, + "flos": 581274952704.0, + "grad_norm": 0.10934470386726207, + "language_loss": 0.80325609, + "learning_rate": 0.0004384709823571958, + "loss": 0.81442899, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.17321777, + "step": 2875, + "time_per_iteration": 2.7749507427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116347, + "balance_loss_mlp": 1.09927666, + "epoch": 0.5532897268180069, + "flos": 1122488658432.0, + "grad_norm": 0.09489557943610515, + "language_loss": 0.82828677, + "learning_rate": 0.0004381618198958932, + "loss": 0.83945024, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.17089844, + "step": 2876, + "time_per_iteration": 3.550828218460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113394, + "balance_loss_mlp": 1.09662116, + "epoch": 0.5534821085032705, + "flos": 637273640448.0, + "grad_norm": 0.0896519056563172, + "language_loss": 0.83453453, + "learning_rate": 0.00043785268144230137, + "loss": 0.84566844, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.16784668, + "step": 2877, + "time_per_iteration": 2.934293270111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100462, + "balance_loss_mlp": 1.08360553, + "epoch": 0.5536744901885341, + "flos": 571112529408.0, + "grad_norm": 0.09194081720705921, + "language_loss": 0.8212803, + "learning_rate": 0.00043754356711643837, + "loss": 0.83228499, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 
0.16870117, + "step": 2878, + "time_per_iteration": 2.7139456272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100534, + "balance_loss_mlp": 1.08367825, + "epoch": 0.5538668718737976, + "flos": 595716871680.0, + "grad_norm": 0.06610172637947556, + "language_loss": 0.83962673, + "learning_rate": 0.0004372344770383132, + "loss": 0.85063207, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.16870117, + "step": 2879, + "time_per_iteration": 2.848620891571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093635, + "balance_loss_mlp": 1.07679105, + "epoch": 0.5540592535590612, + "flos": 532602210816.0, + "grad_norm": 0.058036155609321634, + "language_loss": 0.82615423, + "learning_rate": 0.00043692541132792507, + "loss": 0.83709061, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.1685791, + "step": 2880, + "time_per_iteration": 2.713151693344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091805, + "balance_loss_mlp": 1.07453132, + "epoch": 0.5542516352443247, + "flos": 412619235840.0, + "grad_norm": 0.07516039196528058, + "language_loss": 0.83473843, + "learning_rate": 0.00043661637010526384, + "loss": 0.84565651, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.17285156, + "step": 2881, + "time_per_iteration": 2.500458240509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109005, + "balance_loss_mlp": 1.07309878, + "epoch": 0.5544440169295883, + "flos": 547607609856.0, + "grad_norm": 0.06896643795770978, + "language_loss": 0.83134168, + "learning_rate": 0.00043630735349031025, + "loss": 0.84224218, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.16967773, + "step": 2882, + "time_per_iteration": 2.7521300315856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089845, + "balance_loss_mlp": 1.07317972, + "epoch": 0.5546363986148518, + "flos": 621821131776.0, + "grad_norm": 0.0736705000466592, + 
"language_loss": 0.81719375, + "learning_rate": 0.00043599836160303495, + "loss": 0.82809222, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.16674805, + "step": 2883, + "time_per_iteration": 2.927696704864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092625, + "balance_loss_mlp": 1.07550669, + "epoch": 0.5548287803001154, + "flos": 705292945920.0, + "grad_norm": 0.07830589066561539, + "language_loss": 0.77380168, + "learning_rate": 0.0004356893945633995, + "loss": 0.78472787, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.17126465, + "step": 2884, + "time_per_iteration": 2.9854161739349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095707, + "balance_loss_mlp": 1.07886314, + "epoch": 0.555021161985379, + "flos": 504197789184.0, + "grad_norm": 0.06846026312584631, + "language_loss": 0.81705189, + "learning_rate": 0.0004353804524913551, + "loss": 0.82800889, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.1685791, + "step": 2885, + "time_per_iteration": 2.6230812072753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109452, + "balance_loss_mlp": 1.07769918, + "epoch": 0.5552135436706426, + "flos": 616066684416.0, + "grad_norm": 0.07648898628472602, + "language_loss": 0.81513786, + "learning_rate": 0.0004350715355068441, + "loss": 0.82608306, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.16821289, + "step": 2886, + "time_per_iteration": 2.7672505378723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088661, + "balance_loss_mlp": 1.07191217, + "epoch": 0.5554059253559062, + "flos": 463871494656.0, + "grad_norm": 0.09976401172783889, + "language_loss": 0.79409927, + "learning_rate": 0.00043476264372979847, + "loss": 0.80498588, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.16760254, + "step": 2887, + "time_per_iteration": 2.5482900142669678 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0108678, + "balance_loss_mlp": 1.07004309, + "epoch": 0.5555983070411696, + "flos": 1562512384512.0, + "grad_norm": 0.07823105816490118, + "language_loss": 0.78681719, + "learning_rate": 0.0004344537772801408, + "loss": 0.79768503, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.16748047, + "step": 2888, + "time_per_iteration": 3.8460328578948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021438, + "balance_loss_mlp": 1.01290298, + "epoch": 0.5557906887264332, + "flos": 1467917821440.0, + "grad_norm": 0.01755933384686064, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74443889, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.08544922, + "step": 2889, + "time_per_iteration": 4.991191625595093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090362, + "balance_loss_mlp": 1.07311237, + "epoch": 0.5559830704116968, + "flos": 529832544768.0, + "grad_norm": 0.07150457401269486, + "language_loss": 0.83297288, + "learning_rate": 0.0004338361208426298, + "loss": 0.84387648, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.17272949, + "step": 2890, + "time_per_iteration": 2.6730411052703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108659, + "balance_loss_mlp": 1.06942344, + "epoch": 0.5561754520969604, + "flos": 651218890752.0, + "grad_norm": 0.07268648775014128, + "language_loss": 0.81282032, + "learning_rate": 0.00043352733109457164, + "loss": 0.82368624, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.17175293, + "step": 2891, + "time_per_iteration": 2.9306631088256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094106, + "balance_loss_mlp": 1.07713079, + "epoch": 0.556367833782224, + "flos": 734297923584.0, + "grad_norm": 0.057117910972540105, + "language_loss": 0.8439607, + "learning_rate": 0.00043321856715349244, + "loss": 0.85490179, + "num_input_tokens_seen": 
241706272, + "router_z_loss_mlp": 0.1697998, + "step": 2892, + "time_per_iteration": 2.9671812057495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089069, + "balance_loss_mlp": 1.07197452, + "epoch": 0.5565602154674875, + "flos": 672423648768.0, + "grad_norm": 0.07676329529256688, + "language_loss": 0.80519265, + "learning_rate": 0.00043290982913926466, + "loss": 0.81608331, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.17089844, + "step": 2893, + "time_per_iteration": 2.853346347808838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095237, + "balance_loss_mlp": 1.07807112, + "epoch": 0.556752597152751, + "flos": 586228783104.0, + "grad_norm": 0.07854184605893377, + "language_loss": 0.84350514, + "learning_rate": 0.0004326011171717514, + "loss": 0.8544575, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.171875, + "step": 2894, + "time_per_iteration": 2.899630546569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090371, + "balance_loss_mlp": 1.07324028, + "epoch": 0.5569449788380146, + "flos": 437777146368.0, + "grad_norm": 0.0742839839754536, + "language_loss": 0.80647063, + "learning_rate": 0.0004322924313708051, + "loss": 0.81737435, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.17138672, + "step": 2895, + "time_per_iteration": 2.51411771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094314, + "balance_loss_mlp": 1.07758927, + "epoch": 0.5571373605232782, + "flos": 502250761728.0, + "grad_norm": 0.09937187753239417, + "language_loss": 0.8452369, + "learning_rate": 0.0004319837718562681, + "loss": 0.85618007, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.1673584, + "step": 2896, + "time_per_iteration": 2.655710220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079176, + "balance_loss_mlp": 1.06149721, + "epoch": 0.5573297422085417, + "flos": 577417973760.0, + "grad_norm": 
0.08562998531843592, + "language_loss": 0.83042324, + "learning_rate": 0.0004316751387479726, + "loss": 0.84121501, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.17700195, + "step": 2897, + "time_per_iteration": 2.7913060188293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087861, + "balance_loss_mlp": 1.07069528, + "epoch": 0.5575221238938053, + "flos": 1344037515264.0, + "grad_norm": 0.0783746969742657, + "language_loss": 0.82070696, + "learning_rate": 0.0004313665321657409, + "loss": 0.83158553, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.17175293, + "step": 2898, + "time_per_iteration": 3.726264476776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086135, + "balance_loss_mlp": 1.06881404, + "epoch": 0.5577145055790689, + "flos": 601963218432.0, + "grad_norm": 0.0851867501114316, + "language_loss": 0.79751718, + "learning_rate": 0.00043105795222938436, + "loss": 0.80837852, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.17346191, + "step": 2899, + "time_per_iteration": 2.7452197074890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079222, + "balance_loss_mlp": 1.06218684, + "epoch": 0.5579068872643325, + "flos": 562620349440.0, + "grad_norm": 0.07553101492130006, + "language_loss": 0.78055334, + "learning_rate": 0.00043074939905870467, + "loss": 0.7913456, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.17053223, + "step": 2900, + "time_per_iteration": 2.6780247688293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107635, + "balance_loss_mlp": 1.05935049, + "epoch": 0.558099268949596, + "flos": 544551247872.0, + "grad_norm": 0.07503151839740589, + "language_loss": 0.80663788, + "learning_rate": 0.0004304408727734927, + "loss": 0.81740135, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.17016602, + "step": 2901, + "time_per_iteration": 2.7029857635498047 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01073519, + "balance_loss_mlp": 1.05609071, + "epoch": 0.5582916506348595, + "flos": 552786467328.0, + "grad_norm": 0.07321045917693372, + "language_loss": 0.88611877, + "learning_rate": 0.0004301323734935288, + "loss": 0.89685392, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.17443848, + "step": 2902, + "time_per_iteration": 2.679443597793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107071, + "balance_loss_mlp": 1.05356789, + "epoch": 0.5584840323201231, + "flos": 543385013760.0, + "grad_norm": 0.07694594545228804, + "language_loss": 0.8710258, + "learning_rate": 0.000429823901338583, + "loss": 0.88173282, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.17150879, + "step": 2903, + "time_per_iteration": 2.627321720123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069288, + "balance_loss_mlp": 1.05181181, + "epoch": 0.5586764140053867, + "flos": 815573090304.0, + "grad_norm": 0.06625834371738154, + "language_loss": 0.8649714, + "learning_rate": 0.00042951545642841513, + "loss": 0.87566429, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.17492676, + "step": 2904, + "time_per_iteration": 3.0950725078582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079393, + "balance_loss_mlp": 1.06204844, + "epoch": 0.5588687956906503, + "flos": 486439976448.0, + "grad_norm": 0.06552893866180562, + "language_loss": 0.86677754, + "learning_rate": 0.0004292070388827737, + "loss": 0.87757146, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.17358398, + "step": 2905, + "time_per_iteration": 2.6045844554901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079778, + "balance_loss_mlp": 1.0621829, + "epoch": 0.5590611773759138, + "flos": 452060849664.0, + "grad_norm": 0.06250610211350227, + "language_loss": 0.81015515, + "learning_rate": 0.00042889864882139753, + "loss": 0.82095295, + "num_input_tokens_seen": 
242737456, + "router_z_loss_mlp": 0.17602539, + "step": 2906, + "time_per_iteration": 2.5961766242980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089486, + "balance_loss_mlp": 1.07233191, + "epoch": 0.5592535590611774, + "flos": 520945012224.0, + "grad_norm": 0.06934465100856418, + "language_loss": 0.81378168, + "learning_rate": 0.0004285902863640139, + "loss": 0.82467651, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.17175293, + "step": 2907, + "time_per_iteration": 2.6232824325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085922, + "balance_loss_mlp": 1.06869626, + "epoch": 0.5594459407464409, + "flos": 552519595008.0, + "grad_norm": 0.10268967312822828, + "language_loss": 0.86113304, + "learning_rate": 0.00042828195163033966, + "loss": 0.87199223, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.17236328, + "step": 2908, + "time_per_iteration": 2.696558952331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099626, + "balance_loss_mlp": 1.08187604, + "epoch": 0.5596383224317045, + "flos": 484833973248.0, + "grad_norm": 0.07292872799420033, + "language_loss": 0.78787363, + "learning_rate": 0.0004279736447400812, + "loss": 0.79886991, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.1776123, + "step": 2909, + "time_per_iteration": 2.5506749153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097418, + "balance_loss_mlp": 1.08000195, + "epoch": 0.5598307041169681, + "flos": 611256015360.0, + "grad_norm": 0.08183440800263254, + "language_loss": 0.78410208, + "learning_rate": 0.00042766536581293385, + "loss": 0.79507631, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.17431641, + "step": 2910, + "time_per_iteration": 2.762291193008423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107558, + "balance_loss_mlp": 1.09001017, + "epoch": 0.5600230858022316, + "flos": 488851365888.0, + 
"grad_norm": 0.07156517368649688, + "language_loss": 0.79594785, + "learning_rate": 0.0004273571149685819, + "loss": 0.80702341, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.17541504, + "step": 2911, + "time_per_iteration": 2.8065130710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106568, + "balance_loss_mlp": 1.08937764, + "epoch": 0.5602154674874952, + "flos": 598869780480.0, + "grad_norm": 0.09303022295818829, + "language_loss": 0.83760977, + "learning_rate": 0.00042704889232669937, + "loss": 0.84867543, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.17199707, + "step": 2912, + "time_per_iteration": 2.7454051971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107264, + "balance_loss_mlp": 1.09049106, + "epoch": 0.5604078491727588, + "flos": 585969624576.0, + "grad_norm": 0.08686899917243208, + "language_loss": 0.85566956, + "learning_rate": 0.0004267406980069484, + "loss": 0.86674225, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.16772461, + "step": 2913, + "time_per_iteration": 2.703652858734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100261, + "balance_loss_mlp": 1.08297539, + "epoch": 0.5606002308580224, + "flos": 541205618688.0, + "grad_norm": 0.07169329099349257, + "language_loss": 0.79587048, + "learning_rate": 0.0004264325321289808, + "loss": 0.80687308, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.17297363, + "step": 2914, + "time_per_iteration": 2.8367066383361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100801, + "balance_loss_mlp": 1.08408761, + "epoch": 0.5607926125432858, + "flos": 583938533376.0, + "grad_norm": 0.08752271404037346, + "language_loss": 0.85925829, + "learning_rate": 0.00042612439481243736, + "loss": 0.87026626, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.16711426, + "step": 2915, + "time_per_iteration": 2.801067590713501 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102823, + "balance_loss_mlp": 1.08577609, + "epoch": 0.5609849942285494, + "flos": 627489317376.0, + "grad_norm": 0.08075626027224062, + "language_loss": 0.89818108, + "learning_rate": 0.00042581628617694735, + "loss": 0.90920925, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.1706543, + "step": 2916, + "time_per_iteration": 2.75644588470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101283, + "balance_loss_mlp": 1.08478427, + "epoch": 0.561177375913813, + "flos": 588366332928.0, + "grad_norm": 0.09688272488525364, + "language_loss": 0.82010305, + "learning_rate": 0.0004255082063421296, + "loss": 0.83111584, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.16503906, + "step": 2917, + "time_per_iteration": 2.7048747539520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101411, + "balance_loss_mlp": 1.08411336, + "epoch": 0.5613697575990766, + "flos": 527047824384.0, + "grad_norm": 0.05911652799286667, + "language_loss": 0.84559923, + "learning_rate": 0.00042520015542759065, + "loss": 0.8566134, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.17297363, + "step": 2918, + "time_per_iteration": 2.8888731002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096781, + "balance_loss_mlp": 1.0798173, + "epoch": 0.5615621392843402, + "flos": 642655130112.0, + "grad_norm": 0.0855416495861322, + "language_loss": 0.87984401, + "learning_rate": 0.00042489213355292687, + "loss": 0.8908118, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.1697998, + "step": 2919, + "time_per_iteration": 2.9039535522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099415, + "balance_loss_mlp": 1.08183169, + "epoch": 0.5617545209696037, + "flos": 427750543872.0, + "grad_norm": 0.09901142655299539, + "language_loss": 0.80785292, + "learning_rate": 0.00042458414083772276, + "loss": 0.81884712, + 
"num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.17590332, + "step": 2920, + "time_per_iteration": 2.55914306640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100735, + "balance_loss_mlp": 1.08350968, + "epoch": 0.5619469026548672, + "flos": 568429125120.0, + "grad_norm": 0.058059763768477664, + "language_loss": 0.84851801, + "learning_rate": 0.000424276177401552, + "loss": 0.85952532, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.17248535, + "step": 2921, + "time_per_iteration": 2.847381353378296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090657, + "balance_loss_mlp": 1.07289529, + "epoch": 0.5621392843401308, + "flos": 505205807616.0, + "grad_norm": 0.08698061874066902, + "language_loss": 0.85584521, + "learning_rate": 0.0004239682433639763, + "loss": 0.86675179, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.17785645, + "step": 2922, + "time_per_iteration": 2.707058906555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095936, + "balance_loss_mlp": 1.07888877, + "epoch": 0.5623316660253944, + "flos": 516996628992.0, + "grad_norm": 0.07977820706870507, + "language_loss": 0.85277724, + "learning_rate": 0.0004236603388445467, + "loss": 0.86373651, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.1706543, + "step": 2923, + "time_per_iteration": 2.6301956176757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090666, + "balance_loss_mlp": 1.07373846, + "epoch": 0.5625240477106579, + "flos": 606012917760.0, + "grad_norm": 0.07720818022124956, + "language_loss": 0.81903416, + "learning_rate": 0.00042335246396280166, + "loss": 0.8299408, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.16943359, + "step": 2924, + "time_per_iteration": 2.834073066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090909, + "balance_loss_mlp": 1.07374263, + "epoch": 0.5627164293959215, + "flos": 
450430253568.0, + "grad_norm": 0.07626854299399176, + "language_loss": 0.9026264, + "learning_rate": 0.0004230446188382693, + "loss": 0.91353548, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.171875, + "step": 2925, + "time_per_iteration": 2.6027684211730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092312, + "balance_loss_mlp": 1.07481217, + "epoch": 0.5629088110811851, + "flos": 742073550336.0, + "grad_norm": 0.06785040334520868, + "language_loss": 0.80436468, + "learning_rate": 0.0004227368035904654, + "loss": 0.81528783, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.17504883, + "step": 2926, + "time_per_iteration": 3.005417585372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097122, + "balance_loss_mlp": 1.0790019, + "epoch": 0.5631011927664487, + "flos": 496970588160.0, + "grad_norm": 0.06983498391207757, + "language_loss": 0.82735908, + "learning_rate": 0.00042242901833889474, + "loss": 0.83833027, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.18139648, + "step": 2927, + "time_per_iteration": 2.6397151947021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090818, + "balance_loss_mlp": 1.07340133, + "epoch": 0.5632935744517122, + "flos": 886137408000.0, + "grad_norm": 0.08127979757153865, + "language_loss": 0.85876542, + "learning_rate": 0.0004221212632030501, + "loss": 0.86967361, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.17443848, + "step": 2928, + "time_per_iteration": 3.098761558532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098859, + "balance_loss_mlp": 1.08115637, + "epoch": 0.5634859561369757, + "flos": 604792355328.0, + "grad_norm": 0.07359943981906872, + "language_loss": 0.80209559, + "learning_rate": 0.0004218135383024124, + "loss": 0.81308413, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.17724609, + "step": 2929, + "time_per_iteration": 2.7450544834136963 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087151, + "balance_loss_mlp": 1.06923413, + "epoch": 0.5636783378222393, + "flos": 453916472832.0, + "grad_norm": 0.08357226339131614, + "language_loss": 0.85142308, + "learning_rate": 0.0004215058437564511, + "loss": 0.86229455, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.17919922, + "step": 2930, + "time_per_iteration": 2.592543125152588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083944, + "balance_loss_mlp": 1.06644368, + "epoch": 0.5638707195075029, + "flos": 518456899584.0, + "grad_norm": 0.14879002546575693, + "language_loss": 0.82019955, + "learning_rate": 0.00042119817968462397, + "loss": 0.83103901, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.17504883, + "step": 2931, + "time_per_iteration": 2.645047187805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080791, + "balance_loss_mlp": 1.06259942, + "epoch": 0.5640631011927665, + "flos": 564873896448.0, + "grad_norm": 0.08065967807891394, + "language_loss": 0.86642003, + "learning_rate": 0.0004208905462063766, + "loss": 0.87722796, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.18200684, + "step": 2932, + "time_per_iteration": 2.6538538932800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108164, + "balance_loss_mlp": 1.06381869, + "epoch": 0.56425548287803, + "flos": 517033704960.0, + "grad_norm": 0.07678540437917139, + "language_loss": 0.84284365, + "learning_rate": 0.00042058294344114315, + "loss": 0.85366011, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.17834473, + "step": 2933, + "time_per_iteration": 2.658790349960327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088075, + "balance_loss_mlp": 1.07069397, + "epoch": 0.5644478645632935, + "flos": 854258876928.0, + "grad_norm": 0.06842628935517767, + "language_loss": 0.77464747, + "learning_rate": 0.0004202753715083456, + "loss": 0.78552824, 
+ "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.1739502, + "step": 2934, + "time_per_iteration": 3.0965383052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084539, + "balance_loss_mlp": 1.06742072, + "epoch": 0.5646402462485571, + "flos": 553438780416.0, + "grad_norm": 0.07525134320826762, + "language_loss": 0.80874884, + "learning_rate": 0.0004199678305273936, + "loss": 0.81959426, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.17126465, + "step": 2935, + "time_per_iteration": 2.6553165912628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097701, + "balance_loss_mlp": 1.08022487, + "epoch": 0.5648326279338207, + "flos": 685990798848.0, + "grad_norm": 0.06441901520709055, + "language_loss": 0.81395012, + "learning_rate": 0.0004196603206176854, + "loss": 0.82492715, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.17492676, + "step": 2936, + "time_per_iteration": 2.983830213546753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087212, + "balance_loss_mlp": 1.07004595, + "epoch": 0.5650250096190843, + "flos": 803327818752.0, + "grad_norm": 0.07452375479830534, + "language_loss": 0.83586991, + "learning_rate": 0.000419352841898607, + "loss": 0.84674203, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.171875, + "step": 2937, + "time_per_iteration": 3.003563404083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089681, + "balance_loss_mlp": 1.07318234, + "epoch": 0.5652173913043478, + "flos": 582058317312.0, + "grad_norm": 0.07366437466259683, + "language_loss": 0.76944578, + "learning_rate": 0.000419045394489532, + "loss": 0.78034258, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.16503906, + "step": 2938, + "time_per_iteration": 2.6973941326141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089785, + "balance_loss_mlp": 1.07220173, + "epoch": 0.5654097729896114, + "flos": 
820648060416.0, + "grad_norm": 0.09626894788078913, + "language_loss": 0.76665318, + "learning_rate": 0.0004187379785098224, + "loss": 0.77755105, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.17602539, + "step": 2939, + "time_per_iteration": 3.165407657623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089382, + "balance_loss_mlp": 1.07268023, + "epoch": 0.565602154674875, + "flos": 784156723200.0, + "grad_norm": 0.07214080103004945, + "language_loss": 0.83462155, + "learning_rate": 0.00041843059407882744, + "loss": 0.84551537, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.16711426, + "step": 2940, + "time_per_iteration": 2.9633572101593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086882, + "balance_loss_mlp": 1.06998992, + "epoch": 0.5657945363601385, + "flos": 549683117568.0, + "grad_norm": 0.07122107277750783, + "language_loss": 0.8230179, + "learning_rate": 0.0004181232413158842, + "loss": 0.83388674, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.16906738, + "step": 2941, + "time_per_iteration": 2.6848304271698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091116, + "balance_loss_mlp": 1.07422447, + "epoch": 0.5659869180454021, + "flos": 668126900736.0, + "grad_norm": 0.08263268782748946, + "language_loss": 0.82281923, + "learning_rate": 0.0004178159203403179, + "loss": 0.83373046, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.16906738, + "step": 2942, + "time_per_iteration": 2.84724760055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090202, + "balance_loss_mlp": 1.07366729, + "epoch": 0.5661792997306656, + "flos": 499955369472.0, + "grad_norm": 0.06696308597668005, + "language_loss": 0.81382257, + "learning_rate": 0.0004175086312714409, + "loss": 0.82472456, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.16540527, + "step": 2943, + "time_per_iteration": 2.582885265350342 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092361, + "balance_loss_mlp": 1.0759573, + "epoch": 0.5663716814159292, + "flos": 601209589248.0, + "grad_norm": 0.060450118167724956, + "language_loss": 0.83769757, + "learning_rate": 0.00041720137422855366, + "loss": 0.84862119, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.1640625, + "step": 2944, + "time_per_iteration": 2.771480083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095642, + "balance_loss_mlp": 1.0798583, + "epoch": 0.5665640631011928, + "flos": 540988305408.0, + "grad_norm": 0.26231884968371866, + "language_loss": 0.7874673, + "learning_rate": 0.00041689414933094383, + "loss": 0.79842371, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.15771484, + "step": 2945, + "time_per_iteration": 2.6965370178222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096629, + "balance_loss_mlp": 1.08027291, + "epoch": 0.5667564447864564, + "flos": 601936054272.0, + "grad_norm": 0.08450400231002299, + "language_loss": 0.81155264, + "learning_rate": 0.00041658695669788653, + "loss": 0.82251894, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.16357422, + "step": 2946, + "time_per_iteration": 2.727442741394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105563, + "balance_loss_mlp": 1.08905292, + "epoch": 0.5669488264717198, + "flos": 659523492864.0, + "grad_norm": 0.08705150140664149, + "language_loss": 0.81145883, + "learning_rate": 0.00041627979644864453, + "loss": 0.82251441, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.16516113, + "step": 2947, + "time_per_iteration": 2.8466544151306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112751, + "balance_loss_mlp": 1.0964433, + "epoch": 0.5671412081569834, + "flos": 485402222592.0, + "grad_norm": 0.062214847979028806, + "language_loss": 0.8092283, + "learning_rate": 0.0004159726687024683, + "loss": 0.82035577, 
+ "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.16308594, + "step": 2948, + "time_per_iteration": 2.649352788925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118496, + "balance_loss_mlp": 1.10242701, + "epoch": 0.567333589842247, + "flos": 729801114624.0, + "grad_norm": 0.09810621328318807, + "language_loss": 0.79565436, + "learning_rate": 0.00041566557357859506, + "loss": 0.80683935, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.16064453, + "step": 2949, + "time_per_iteration": 2.9100704193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128592, + "balance_loss_mlp": 1.11225998, + "epoch": 0.5675259715275106, + "flos": 968887526400.0, + "grad_norm": 0.08040833195953295, + "language_loss": 0.79227537, + "learning_rate": 0.0004153585111962502, + "loss": 0.80356133, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.16333008, + "step": 2950, + "time_per_iteration": 3.332738161087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135277, + "balance_loss_mlp": 1.11884952, + "epoch": 0.5677183532127742, + "flos": 565145538048.0, + "grad_norm": 0.06937214621935889, + "language_loss": 0.84358597, + "learning_rate": 0.0004150514816746453, + "loss": 0.85493875, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.16418457, + "step": 2951, + "time_per_iteration": 2.712589979171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138042, + "balance_loss_mlp": 1.12165022, + "epoch": 0.5679107348980377, + "flos": 551694385152.0, + "grad_norm": 0.07032847030676616, + "language_loss": 0.85400414, + "learning_rate": 0.0004147444851329802, + "loss": 0.86538458, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.16394043, + "step": 2952, + "time_per_iteration": 2.6828949451446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147692, + "balance_loss_mlp": 1.13107419, + "epoch": 0.5681031165833013, + "flos": 
819459804672.0, + "grad_norm": 0.07370144055460691, + "language_loss": 0.85637259, + "learning_rate": 0.00041443752169044126, + "loss": 0.86784947, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.16625977, + "step": 2953, + "time_per_iteration": 3.0499908924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156702, + "balance_loss_mlp": 1.13983333, + "epoch": 0.5682954982685648, + "flos": 618013711872.0, + "grad_norm": 0.07840541898783242, + "language_loss": 0.84904528, + "learning_rate": 0.0004141305914662025, + "loss": 0.86061233, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.16882324, + "step": 2954, + "time_per_iteration": 2.732133626937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135414, + "balance_loss_mlp": 1.1186291, + "epoch": 0.5684878799538284, + "flos": 647949984768.0, + "grad_norm": 0.0690175597343332, + "language_loss": 0.80056989, + "learning_rate": 0.0004138236945794246, + "loss": 0.81192404, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.16784668, + "step": 2955, + "time_per_iteration": 2.920898914337158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127256, + "balance_loss_mlp": 1.1108526, + "epoch": 0.5686802616390919, + "flos": 805961664000.0, + "grad_norm": 0.09346989124624208, + "language_loss": 0.83651698, + "learning_rate": 0.00041351683114925576, + "loss": 0.84778959, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.1640625, + "step": 2956, + "time_per_iteration": 3.1179428100585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122658, + "balance_loss_mlp": 1.10612392, + "epoch": 0.5688726433243555, + "flos": 547140676608.0, + "grad_norm": 0.07393250127791023, + "language_loss": 0.86702883, + "learning_rate": 0.0004132100012948308, + "loss": 0.87825537, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.16540527, + "step": 2957, + "time_per_iteration": 2.6336829662323 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127835, + "balance_loss_mlp": 1.11014426, + "epoch": 0.5690650250096191, + "flos": 486568456704.0, + "grad_norm": 0.08317259373738083, + "language_loss": 0.84444946, + "learning_rate": 0.00041290320513527145, + "loss": 0.85572779, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.17712402, + "step": 2958, + "time_per_iteration": 2.641665458679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123865, + "balance_loss_mlp": 1.10708022, + "epoch": 0.5692574066948827, + "flos": 577457620992.0, + "grad_norm": 0.07155108401540258, + "language_loss": 0.8494001, + "learning_rate": 0.0004125964427896867, + "loss": 0.86063874, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.16796875, + "step": 2959, + "time_per_iteration": 2.6707890033721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111469, + "balance_loss_mlp": 1.09486318, + "epoch": 0.5694497883801463, + "flos": 454247585280.0, + "grad_norm": 0.06610188466362152, + "language_loss": 0.79023135, + "learning_rate": 0.0004122897143771723, + "loss": 0.80134606, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.1661377, + "step": 2960, + "time_per_iteration": 2.564518690109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113363, + "balance_loss_mlp": 1.09644711, + "epoch": 0.5696421700654097, + "flos": 559516999680.0, + "grad_norm": 0.06798711275929166, + "language_loss": 0.81482321, + "learning_rate": 0.0004119830200168109, + "loss": 0.82595682, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.16931152, + "step": 2961, + "time_per_iteration": 2.6972579956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119725, + "balance_loss_mlp": 1.10334563, + "epoch": 0.5698345517506733, + "flos": 465551649792.0, + "grad_norm": 0.08529196588510703, + "language_loss": 0.88292432, + "learning_rate": 0.0004116763598276714, + "loss": 0.89412153, 
+ "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.16381836, + "step": 2962, + "time_per_iteration": 2.5670664310455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110605, + "balance_loss_mlp": 1.09353447, + "epoch": 0.5700269334359369, + "flos": 605953446912.0, + "grad_norm": 0.06258641476293567, + "language_loss": 0.80866015, + "learning_rate": 0.00041136973392881017, + "loss": 0.81976616, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.17077637, + "step": 2963, + "time_per_iteration": 2.883714437484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106776, + "balance_loss_mlp": 1.08975244, + "epoch": 0.5702193151212005, + "flos": 562709182464.0, + "grad_norm": 0.07231503990514958, + "language_loss": 0.81792593, + "learning_rate": 0.00041106314243926983, + "loss": 0.82899374, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.17041016, + "step": 2964, + "time_per_iteration": 2.7783985137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105614, + "balance_loss_mlp": 1.08862686, + "epoch": 0.570411696806464, + "flos": 523247745024.0, + "grad_norm": 0.0703519634607743, + "language_loss": 0.87298268, + "learning_rate": 0.0004107565854780798, + "loss": 0.88403881, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.17004395, + "step": 2965, + "time_per_iteration": 2.6647095680236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105743, + "balance_loss_mlp": 1.08862448, + "epoch": 0.5706040784917276, + "flos": 718222837248.0, + "grad_norm": 0.10409226913166654, + "language_loss": 0.81182659, + "learning_rate": 0.000410450063164256, + "loss": 0.82288408, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.17126465, + "step": 2966, + "time_per_iteration": 2.866602659225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104134, + "balance_loss_mlp": 1.08703911, + "epoch": 0.5707964601769911, + "flos": 
476707410432.0, + "grad_norm": 0.07688057786324835, + "language_loss": 0.82004988, + "learning_rate": 0.00041014357561680115, + "loss": 0.83109128, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.17114258, + "step": 2967, + "time_per_iteration": 2.5523133277893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109926, + "balance_loss_mlp": 1.09312987, + "epoch": 0.5709888418622547, + "flos": 580101378048.0, + "grad_norm": 0.0904159605578498, + "language_loss": 0.86166346, + "learning_rate": 0.0004098371229547039, + "loss": 0.87276274, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.16809082, + "step": 2968, + "time_per_iteration": 2.724207878112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_mlp": 1.022156, + "epoch": 0.5711812235475183, + "flos": 1579922910720.0, + "grad_norm": 0.013041633212772678, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81042308, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.09326172, + "step": 2969, + "time_per_iteration": 4.806856155395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113794, + "balance_loss_mlp": 1.09678328, + "epoch": 0.5713736052327818, + "flos": 468506695680.0, + "grad_norm": 0.07993701822539574, + "language_loss": 0.80239302, + "learning_rate": 0.00040922432276247107, + "loss": 0.81353092, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.17028809, + "step": 2970, + "time_per_iteration": 2.603079319000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119244, + "balance_loss_mlp": 1.1021136, + "epoch": 0.5715659869180454, + "flos": 537662499840.0, + "grad_norm": 0.07050688201783964, + "language_loss": 0.84539342, + "learning_rate": 0.0004089179754702457, + "loss": 0.85658586, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.17150879, + "step": 2971, + "time_per_iteration": 2.806685209274292 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125569, + "balance_loss_mlp": 1.10841513, + "epoch": 0.571758368603309, + "flos": 656071778304.0, + "grad_norm": 0.1127525051095751, + "language_loss": 0.79654694, + "learning_rate": 0.00040861166353919843, + "loss": 0.80780256, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.17175293, + "step": 2972, + "time_per_iteration": 2.822960138320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122737, + "balance_loss_mlp": 1.10572612, + "epoch": 0.5719507502885726, + "flos": 667907016192.0, + "grad_norm": 0.06522156109142956, + "language_loss": 0.81529987, + "learning_rate": 0.00040830538708824983, + "loss": 0.8265273, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.17028809, + "step": 2973, + "time_per_iteration": 2.883183479309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114225, + "balance_loss_mlp": 1.09716594, + "epoch": 0.572143131973836, + "flos": 476321969664.0, + "grad_norm": 0.05988777943056807, + "language_loss": 0.81712234, + "learning_rate": 0.000407999146236307, + "loss": 0.82826465, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.17077637, + "step": 2974, + "time_per_iteration": 2.583639144897461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113969, + "balance_loss_mlp": 1.09735084, + "epoch": 0.5723355136590996, + "flos": 539510782464.0, + "grad_norm": 0.08488733778098946, + "language_loss": 0.83322281, + "learning_rate": 0.0004076929411022634, + "loss": 0.84436244, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.16625977, + "step": 2975, + "time_per_iteration": 2.6634230613708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117906, + "balance_loss_mlp": 1.10096645, + "epoch": 0.5725278953443632, + "flos": 824156674560.0, + "grad_norm": 0.10471513442043413, + "language_loss": 0.7910713, + "learning_rate": 0.0004073867718049982, + "loss": 0.80225033, + 
"num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.16955566, + "step": 2976, + "time_per_iteration": 3.101864814758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116871, + "balance_loss_mlp": 1.10026503, + "epoch": 0.5727202770296268, + "flos": 587437235712.0, + "grad_norm": 0.08664196816998121, + "language_loss": 0.82484782, + "learning_rate": 0.00040708063846337704, + "loss": 0.83601654, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.1661377, + "step": 2977, + "time_per_iteration": 2.7438297271728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106932, + "balance_loss_mlp": 1.08967066, + "epoch": 0.5729126587148904, + "flos": 446966055936.0, + "grad_norm": 0.07799786255299582, + "language_loss": 0.81199914, + "learning_rate": 0.00040677454119625143, + "loss": 0.8230685, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.17285156, + "step": 2978, + "time_per_iteration": 2.5837550163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095322, + "balance_loss_mlp": 1.07809663, + "epoch": 0.5731050404001539, + "flos": 519457577472.0, + "grad_norm": 0.1059947946829761, + "language_loss": 0.82621056, + "learning_rate": 0.0004064684801224587, + "loss": 0.83716381, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.17236328, + "step": 2979, + "time_per_iteration": 2.6220715045928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095905, + "balance_loss_mlp": 1.07850003, + "epoch": 0.5732974220854175, + "flos": 504775950336.0, + "grad_norm": 0.06700215842091113, + "language_loss": 0.80611891, + "learning_rate": 0.00040616245536082224, + "loss": 0.81707793, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.17431641, + "step": 2980, + "time_per_iteration": 2.6067917346954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086913, + "balance_loss_mlp": 1.069556, + "epoch": 0.573489803770681, + "flos": 
592485041664.0, + "grad_norm": 0.19945027498537377, + "language_loss": 0.81268358, + "learning_rate": 0.00040585646703015165, + "loss": 0.82355273, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.17370605, + "step": 2981, + "time_per_iteration": 2.910644769668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087867, + "balance_loss_mlp": 1.07096314, + "epoch": 0.5736821854559446, + "flos": 489911514624.0, + "grad_norm": 0.06421268852729406, + "language_loss": 0.78161913, + "learning_rate": 0.0004055505152492419, + "loss": 0.79249781, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.16918945, + "step": 2982, + "time_per_iteration": 2.6653785705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084086, + "balance_loss_mlp": 1.06670547, + "epoch": 0.5738745671412081, + "flos": 458156321280.0, + "grad_norm": 0.08054865949602324, + "language_loss": 0.73896229, + "learning_rate": 0.00040524460013687425, + "loss": 0.74980319, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.1739502, + "step": 2983, + "time_per_iteration": 2.721282958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090667, + "balance_loss_mlp": 1.07357204, + "epoch": 0.5740669488264717, + "flos": 580333372416.0, + "grad_norm": 0.08106324915579151, + "language_loss": 0.81038249, + "learning_rate": 0.0004049387218118155, + "loss": 0.82128918, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.17102051, + "step": 2984, + "time_per_iteration": 2.9739558696746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109026, + "balance_loss_mlp": 1.07321286, + "epoch": 0.5742593305117353, + "flos": 524438572032.0, + "grad_norm": 0.07771926917330779, + "language_loss": 0.84678066, + "learning_rate": 0.00040463288039281777, + "loss": 0.85768324, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.1706543, + "step": 2985, + "time_per_iteration": 2.755789279937744 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049819, + "balance_loss_mlp": 1.0396148, + "epoch": 0.5744517121969989, + "flos": 1553877748224.0, + "grad_norm": 0.027186215876947157, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78926235, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.10205078, + "step": 2986, + "time_per_iteration": 5.024104833602905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102309, + "balance_loss_mlp": 1.08496404, + "epoch": 0.5746440938822625, + "flos": 751919915520.0, + "grad_norm": 0.07406110021904912, + "language_loss": 0.82250667, + "learning_rate": 0.0004040213087479444, + "loss": 0.83352977, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.17346191, + "step": 2987, + "time_per_iteration": 2.954012632369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110505, + "balance_loss_mlp": 1.0885036, + "epoch": 0.5748364755675259, + "flos": 501865320960.0, + "grad_norm": 0.08213209001088305, + "language_loss": 0.85105377, + "learning_rate": 0.0004037155787595018, + "loss": 0.86210424, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.16552734, + "step": 2988, + "time_per_iteration": 2.596590757369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103556, + "balance_loss_mlp": 1.08671117, + "epoch": 0.5750288572527895, + "flos": 504044342784.0, + "grad_norm": 0.06658279323725882, + "language_loss": 0.80333447, + "learning_rate": 0.000403409886151987, + "loss": 0.8143701, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.1685791, + "step": 2989, + "time_per_iteration": 2.9190666675567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049496, + "balance_loss_mlp": 1.03948224, + "epoch": 0.5752212389380531, + "flos": 1541365604352.0, + "grad_norm": 0.024963739862010757, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.830486, 
+ "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.10009766, + "step": 2990, + "time_per_iteration": 4.779403448104858 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_mlp": 1.03442252, + "epoch": 0.5754136206233167, + "flos": 1567331472384.0, + "grad_norm": 0.02279292821926405, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79242849, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.09814453, + "step": 2991, + "time_per_iteration": 4.813813209533691 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104349, + "balance_loss_mlp": 1.08761191, + "epoch": 0.5756060023085803, + "flos": 798156301824.0, + "grad_norm": 0.07351496217070447, + "language_loss": 0.76526999, + "learning_rate": 0.00040249303380173807, + "loss": 0.77631354, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.16748047, + "step": 2992, + "time_per_iteration": 3.0984480381011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099869, + "balance_loss_mlp": 1.08323884, + "epoch": 0.5757983839938438, + "flos": 587877004800.0, + "grad_norm": 0.07106147833910306, + "language_loss": 0.78964388, + "learning_rate": 0.00040218749190459126, + "loss": 0.80064261, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.16638184, + "step": 2993, + "time_per_iteration": 2.7525393962860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109225, + "balance_loss_mlp": 1.07550144, + "epoch": 0.5759907656791073, + "flos": 516831072768.0, + "grad_norm": 0.07997694276494066, + "language_loss": 0.82424486, + "learning_rate": 0.00040188198798162775, + "loss": 0.83516741, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.16760254, + "step": 2994, + "time_per_iteration": 2.6026856899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105077, + "balance_loss_mlp": 1.08812571, + "epoch": 0.5761831473643709, + "flos": 
587133287424.0, + "grad_norm": 0.060991263028610375, + "language_loss": 0.85548359, + "learning_rate": 0.000401576522151455, + "loss": 0.86653435, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.16955566, + "step": 2995, + "time_per_iteration": 2.8387343883514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097873, + "balance_loss_mlp": 1.08148181, + "epoch": 0.5763755290496345, + "flos": 543896363520.0, + "grad_norm": 0.0649014718190417, + "language_loss": 0.82459986, + "learning_rate": 0.0004012710945326651, + "loss": 0.83557856, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.16394043, + "step": 2996, + "time_per_iteration": 2.8002259731292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099492, + "balance_loss_mlp": 1.08355331, + "epoch": 0.576567910734898, + "flos": 626229107712.0, + "grad_norm": 0.07884412717722156, + "language_loss": 0.80980134, + "learning_rate": 0.0004009657052438355, + "loss": 0.82079625, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.15930176, + "step": 2997, + "time_per_iteration": 2.8380162715911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106429, + "balance_loss_mlp": 1.09044361, + "epoch": 0.5767602924201616, + "flos": 538243232256.0, + "grad_norm": 0.09100511136442054, + "language_loss": 0.8548094, + "learning_rate": 0.00040066035440352904, + "loss": 0.86587369, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.15979004, + "step": 2998, + "time_per_iteration": 2.7165040969848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054645, + "balance_loss_mlp": 1.04687226, + "epoch": 0.5769526741054252, + "flos": 1559778301440.0, + "grad_norm": 0.029413044868518267, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80347776, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.07763672, + "step": 2999, + "time_per_iteration": 4.891362905502319 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105098, + "balance_loss_mlp": 1.08894527, + "epoch": 0.5771450557906888, + "flos": 468185495040.0, + "grad_norm": 0.08263350927787948, + "language_loss": 0.75637519, + "learning_rate": 0.00040004976854266145, + "loss": 0.76742619, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.16149902, + "step": 3000, + "time_per_iteration": 2.5579755306243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105893, + "balance_loss_mlp": 1.08987141, + "epoch": 0.5773374374759523, + "flos": 574556903424.0, + "grad_norm": 0.06941869769704709, + "language_loss": 0.81322896, + "learning_rate": 0.0003997445337591505, + "loss": 0.82428795, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.16027832, + "step": 3001, + "time_per_iteration": 2.689349889755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104956, + "balance_loss_mlp": 1.0884937, + "epoch": 0.5775298191612158, + "flos": 528473590272.0, + "grad_norm": 0.09192868754767076, + "language_loss": 0.74184531, + "learning_rate": 0.0003994393378982635, + "loss": 0.75289488, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.16467285, + "step": 3002, + "time_per_iteration": 2.6561992168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074164, + "balance_loss_mlp": 1.06658196, + "epoch": 0.5777222008464794, + "flos": 1303919700480.0, + "grad_norm": 0.035051917356449074, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80612171, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.07568359, + "step": 3003, + "time_per_iteration": 4.835859298706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101477, + "balance_loss_mlp": 1.0852406, + "epoch": 0.577914582531743, + "flos": 603633461760.0, + "grad_norm": 0.07939797508674061, + "language_loss": 0.8815853, + "learning_rate": 0.0003988290634182961, + "loss": 
0.89260006, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.16235352, + "step": 3004, + "time_per_iteration": 2.8315813541412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106342, + "balance_loss_mlp": 1.09034419, + "epoch": 0.5781069642170066, + "flos": 486795681792.0, + "grad_norm": 0.07086440080231367, + "language_loss": 0.80762905, + "learning_rate": 0.0003985239850361453, + "loss": 0.81869251, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.15991211, + "step": 3005, + "time_per_iteration": 2.6647462844848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100467, + "balance_loss_mlp": 1.08430243, + "epoch": 0.5782993459022701, + "flos": 506295318528.0, + "grad_norm": 0.07031230145466298, + "language_loss": 0.84713155, + "learning_rate": 0.0003982189460504777, + "loss": 0.85813624, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.16162109, + "step": 3006, + "time_per_iteration": 2.70588755607605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104818, + "balance_loss_mlp": 1.08837891, + "epoch": 0.5784917275875336, + "flos": 602155938816.0, + "grad_norm": 0.07782537057878013, + "language_loss": 0.78822792, + "learning_rate": 0.00039791394657971935, + "loss": 0.79927599, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.16442871, + "step": 3007, + "time_per_iteration": 2.7525734901428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112062, + "balance_loss_mlp": 1.09575403, + "epoch": 0.5786841092727972, + "flos": 521540425728.0, + "grad_norm": 0.08023947055085524, + "language_loss": 0.84335512, + "learning_rate": 0.00039760898674228205, + "loss": 0.85447574, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.16308594, + "step": 3008, + "time_per_iteration": 2.6740429401397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.08913136, + "epoch": 0.5788764909580608, 
+ "flos": 767404357632.0, + "grad_norm": 0.06481055961735596, + "language_loss": 0.80689526, + "learning_rate": 0.0003973040666565613, + "loss": 0.81794715, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.16052246, + "step": 3009, + "time_per_iteration": 3.0985798835754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105331, + "balance_loss_mlp": 1.08880866, + "epoch": 0.5790688726433244, + "flos": 599094434304.0, + "grad_norm": 0.07104717657711816, + "language_loss": 0.8190769, + "learning_rate": 0.000396999186440938, + "loss": 0.83013022, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.1652832, + "step": 3010, + "time_per_iteration": 2.8631935119628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095613, + "balance_loss_mlp": 1.07888842, + "epoch": 0.5792612543285879, + "flos": 523064936448.0, + "grad_norm": 0.07539914783858101, + "language_loss": 0.85185289, + "learning_rate": 0.000396694346213777, + "loss": 0.86280894, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.1673584, + "step": 3011, + "time_per_iteration": 2.7040622234344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093449, + "balance_loss_mlp": 1.0765686, + "epoch": 0.5794536360138515, + "flos": 876557915136.0, + "grad_norm": 0.06256207841015303, + "language_loss": 0.83364058, + "learning_rate": 0.0003963895460934276, + "loss": 0.84457505, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.16882324, + "step": 3012, + "time_per_iteration": 3.173614025115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089884, + "balance_loss_mlp": 1.07312369, + "epoch": 0.5796460176991151, + "flos": 401436311040.0, + "grad_norm": 0.08299946451997237, + "language_loss": 0.85058802, + "learning_rate": 0.00039608478619822376, + "loss": 0.86148685, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.16772461, + "step": 3013, + "time_per_iteration": 2.4611692428588867 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081939, + "balance_loss_mlp": 1.065166, + "epoch": 0.5798383993843786, + "flos": 618517721088.0, + "grad_norm": 0.06639451681987794, + "language_loss": 0.82375103, + "learning_rate": 0.00039578006664648394, + "loss": 0.83457041, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.16784668, + "step": 3014, + "time_per_iteration": 2.789212703704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085955, + "balance_loss_mlp": 1.06965923, + "epoch": 0.5800307810696421, + "flos": 844331019264.0, + "grad_norm": 0.08034627380925646, + "language_loss": 0.81074166, + "learning_rate": 0.0003954753875565105, + "loss": 0.82160121, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.16296387, + "step": 3015, + "time_per_iteration": 3.1160459518432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082316, + "balance_loss_mlp": 1.06503117, + "epoch": 0.5802231627549057, + "flos": 569276729856.0, + "grad_norm": 0.06677664636320767, + "language_loss": 0.82464337, + "learning_rate": 0.00039517074904659057, + "loss": 0.83546656, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.1730957, + "step": 3016, + "time_per_iteration": 2.716564655303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087588, + "balance_loss_mlp": 1.07085133, + "epoch": 0.5804155444401693, + "flos": 660459930624.0, + "grad_norm": 0.0799627957481028, + "language_loss": 0.84913206, + "learning_rate": 0.00039486615123499535, + "loss": 0.86000794, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.16748047, + "step": 3017, + "time_per_iteration": 2.855402708053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079434, + "balance_loss_mlp": 1.06237507, + "epoch": 0.5806079261254329, + "flos": 513992024064.0, + "grad_norm": 0.08435209251616928, + "language_loss": 0.85015523, + "learning_rate": 0.00039456159423997996, + "loss": 
0.86094958, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.17077637, + "step": 3018, + "time_per_iteration": 2.6843197345733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079601, + "balance_loss_mlp": 1.06261373, + "epoch": 0.5808003078106965, + "flos": 528646487040.0, + "grad_norm": 0.06274674533021377, + "language_loss": 0.89687812, + "learning_rate": 0.00039425707817978406, + "loss": 0.90767419, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.16992188, + "step": 3019, + "time_per_iteration": 2.681183099746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076495, + "balance_loss_mlp": 1.05895901, + "epoch": 0.58099268949596, + "flos": 477028611072.0, + "grad_norm": 0.14184929094941942, + "language_loss": 0.83556581, + "learning_rate": 0.00039395260317263124, + "loss": 0.84633076, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.17553711, + "step": 3020, + "time_per_iteration": 2.629709482192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073542, + "balance_loss_mlp": 1.05577993, + "epoch": 0.5811850711812235, + "flos": 517609294848.0, + "grad_norm": 0.08203162266100236, + "language_loss": 0.84840143, + "learning_rate": 0.0003936481693367291, + "loss": 0.85913682, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.1776123, + "step": 3021, + "time_per_iteration": 2.717710018157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083798, + "balance_loss_mlp": 1.06607115, + "epoch": 0.5813774528664871, + "flos": 616422389760.0, + "grad_norm": 0.08298145922497896, + "language_loss": 0.87323809, + "learning_rate": 0.0003933437767902697, + "loss": 0.88407612, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.17749023, + "step": 3022, + "time_per_iteration": 2.8179917335510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.07563782, + "epoch": 0.5815698345517507, + 
"flos": 567475435008.0, + "grad_norm": 0.07663513037653054, + "language_loss": 0.77978808, + "learning_rate": 0.00039303942565142825, + "loss": 0.79071838, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.17407227, + "step": 3023, + "time_per_iteration": 2.7656824588775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092602, + "balance_loss_mlp": 1.07522154, + "epoch": 0.5817622162370142, + "flos": 563168775168.0, + "grad_norm": 0.09353579288790682, + "language_loss": 0.76389718, + "learning_rate": 0.0003927351160383644, + "loss": 0.77482319, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.1739502, + "step": 3024, + "time_per_iteration": 2.81196665763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096766, + "balance_loss_mlp": 1.07968342, + "epoch": 0.5819545979222778, + "flos": 459216470016.0, + "grad_norm": 0.05988996320852443, + "language_loss": 0.77658468, + "learning_rate": 0.000392430848069222, + "loss": 0.78755236, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.17089844, + "step": 3025, + "time_per_iteration": 2.553349733352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095833, + "balance_loss_mlp": 1.07864261, + "epoch": 0.5821469796075414, + "flos": 541475062272.0, + "grad_norm": 0.09842162601860249, + "language_loss": 0.82432085, + "learning_rate": 0.00039212662186212795, + "loss": 0.83527917, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.17199707, + "step": 3026, + "time_per_iteration": 2.6321003437042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096184, + "balance_loss_mlp": 1.07874346, + "epoch": 0.582339361292805, + "flos": 552262634496.0, + "grad_norm": 0.06216962714468932, + "language_loss": 0.77065325, + "learning_rate": 0.0003918224375351934, + "loss": 0.78161508, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.17468262, + "step": 3027, + "time_per_iteration": 2.7319040298461914 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102446, + "balance_loss_mlp": 1.08531559, + "epoch": 0.5825317429780685, + "flos": 496399767552.0, + "grad_norm": 0.06463813423056745, + "language_loss": 0.78389823, + "learning_rate": 0.0003915182952065135, + "loss": 0.79492265, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.17138672, + "step": 3028, + "time_per_iteration": 2.6997907161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097892, + "balance_loss_mlp": 1.08095205, + "epoch": 0.582724124663332, + "flos": 564162112512.0, + "grad_norm": 0.07943165793883354, + "language_loss": 0.87551522, + "learning_rate": 0.0003912141949941664, + "loss": 0.8864941, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.16955566, + "step": 3029, + "time_per_iteration": 2.7122318744659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091171, + "balance_loss_mlp": 1.07376611, + "epoch": 0.5829165063485956, + "flos": 492132754944.0, + "grad_norm": 0.08419707099866325, + "language_loss": 0.82715654, + "learning_rate": 0.0003909101370162143, + "loss": 0.83806825, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.17431641, + "step": 3030, + "time_per_iteration": 2.6301612854003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010224, + "balance_loss_mlp": 1.00211763, + "epoch": 0.5831088880338592, + "flos": 1528880997888.0, + "grad_norm": 0.006956762065680846, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73444116, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.08105469, + "step": 3031, + "time_per_iteration": 4.870691299438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091064, + "balance_loss_mlp": 1.07400537, + "epoch": 0.5833012697191228, + "flos": 618011140608.0, + "grad_norm": 0.08204338633061625, + "language_loss": 0.82931381, + "learning_rate": 0.0003903021482356622, + 
"loss": 0.8402245, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.1706543, + "step": 3032, + "time_per_iteration": 2.829430103302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091732, + "balance_loss_mlp": 1.07503033, + "epoch": 0.5834936514043862, + "flos": 767920849920.0, + "grad_norm": 0.08520682753706012, + "language_loss": 0.82501173, + "learning_rate": 0.00038999821766910465, + "loss": 0.8359291, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.16711426, + "step": 3033, + "time_per_iteration": 3.0449070930480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087325, + "balance_loss_mlp": 1.07023025, + "epoch": 0.5836860330896498, + "flos": 458371436544.0, + "grad_norm": 0.07138585009560579, + "language_loss": 0.85493183, + "learning_rate": 0.00038969432980902606, + "loss": 0.86580509, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.17114258, + "step": 3034, + "time_per_iteration": 2.6099114418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015774, + "balance_loss_mlp": 1.00771523, + "epoch": 0.5838784147749134, + "flos": 1361225585664.0, + "grad_norm": 0.011956814182891856, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80800277, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.08056641, + "step": 3035, + "time_per_iteration": 4.919405460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084349, + "balance_loss_mlp": 1.0678978, + "epoch": 0.584070796460177, + "flos": 567211133952.0, + "grad_norm": 0.0762930329312805, + "language_loss": 0.82252562, + "learning_rate": 0.00038908668268020953, + "loss": 0.83336914, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.16455078, + "step": 3036, + "time_per_iteration": 2.7005980014801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082695, + "balance_loss_mlp": 1.06603003, + "epoch": 
0.5842631781454406, + "flos": 611483240448.0, + "grad_norm": 0.07750025430989764, + "language_loss": 0.84744304, + "learning_rate": 0.00038878292364738097, + "loss": 0.85826999, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.16674805, + "step": 3037, + "time_per_iteration": 2.854461908340454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085343, + "balance_loss_mlp": 1.0690949, + "epoch": 0.5844555598307041, + "flos": 463384737792.0, + "grad_norm": 0.0866866607830145, + "language_loss": 0.86865294, + "learning_rate": 0.0003884792077928508, + "loss": 0.87950635, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.16235352, + "step": 3038, + "time_per_iteration": 2.526219606399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085659, + "balance_loss_mlp": 1.06974506, + "epoch": 0.5846479415159677, + "flos": 410215186944.0, + "grad_norm": 0.09714525133414084, + "language_loss": 0.76819932, + "learning_rate": 0.0003881755352345322, + "loss": 0.77905595, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.15905762, + "step": 3039, + "time_per_iteration": 2.5546979904174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086917, + "balance_loss_mlp": 1.0702157, + "epoch": 0.5848403232012312, + "flos": 491297633280.0, + "grad_norm": 0.09749751366402076, + "language_loss": 0.86787152, + "learning_rate": 0.0003878719060903207, + "loss": 0.87874067, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.16711426, + "step": 3040, + "time_per_iteration": 2.585848093032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091729, + "balance_loss_mlp": 1.07531416, + "epoch": 0.5850327048864948, + "flos": 584417949696.0, + "grad_norm": 0.0840209110893744, + "language_loss": 0.83088207, + "learning_rate": 0.0003875683204780961, + "loss": 0.84179938, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.16418457, + "step": 3041, + "time_per_iteration": 
2.7646286487579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096113, + "balance_loss_mlp": 1.08006763, + "epoch": 0.5852250865717584, + "flos": 651545233920.0, + "grad_norm": 0.08651728983241819, + "language_loss": 0.85210633, + "learning_rate": 0.00038726477851572043, + "loss": 0.86306751, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.16040039, + "step": 3042, + "time_per_iteration": 2.797314167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101767, + "balance_loss_mlp": 1.08557868, + "epoch": 0.5854174682570219, + "flos": 534588885504.0, + "grad_norm": 0.08316199388994981, + "language_loss": 0.80228806, + "learning_rate": 0.0003869612803210395, + "loss": 0.81330574, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.16186523, + "step": 3043, + "time_per_iteration": 2.6490185260772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103183, + "balance_loss_mlp": 1.08701873, + "epoch": 0.5856098499422855, + "flos": 509752175616.0, + "grad_norm": 0.06777837645025765, + "language_loss": 0.83051372, + "learning_rate": 0.0003866578260118817, + "loss": 0.84154546, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.16162109, + "step": 3044, + "time_per_iteration": 2.6326801776885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106723, + "balance_loss_mlp": 1.09098744, + "epoch": 0.5858022316275491, + "flos": 593893555200.0, + "grad_norm": 0.07505807062734855, + "language_loss": 0.83121902, + "learning_rate": 0.0003863544157060581, + "loss": 0.84228623, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.15722656, + "step": 3045, + "time_per_iteration": 2.7122910022735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113369, + "balance_loss_mlp": 1.09763348, + "epoch": 0.5859946133128127, + "flos": 559126416384.0, + "grad_norm": 0.06825767558676081, + "language_loss": 0.81871521, + "learning_rate": 
0.0003860510495213634, + "loss": 0.82984889, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.15722656, + "step": 3046, + "time_per_iteration": 2.8188610076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113296, + "balance_loss_mlp": 1.09753644, + "epoch": 0.5861869949980761, + "flos": 553695740928.0, + "grad_norm": 0.07680372972712284, + "language_loss": 0.7820521, + "learning_rate": 0.0003857477275755746, + "loss": 0.79318506, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.1574707, + "step": 3047, + "time_per_iteration": 2.680021047592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114239, + "balance_loss_mlp": 1.09859896, + "epoch": 0.5863793766833397, + "flos": 718667375616.0, + "grad_norm": 0.06132573168351462, + "language_loss": 0.83483028, + "learning_rate": 0.00038544444998645167, + "loss": 0.84597266, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.15625, + "step": 3048, + "time_per_iteration": 3.024035692214966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110798, + "balance_loss_mlp": 1.09482431, + "epoch": 0.5865717583686033, + "flos": 472289522688.0, + "grad_norm": 0.07774154556799634, + "language_loss": 0.81755519, + "learning_rate": 0.00038514121687173767, + "loss": 0.82866311, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.15966797, + "step": 3049, + "time_per_iteration": 2.602348566055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106413, + "balance_loss_mlp": 1.09079647, + "epoch": 0.5867641400538669, + "flos": 813482901504.0, + "grad_norm": 0.07288499528915, + "language_loss": 0.81607699, + "learning_rate": 0.00038483802834915807, + "loss": 0.82714111, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.15600586, + "step": 3050, + "time_per_iteration": 3.0202012062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102645, + "balance_loss_mlp": 1.08663559, + 
"epoch": 0.5869565217391305, + "flos": 486531380736.0, + "grad_norm": 0.06464020852625685, + "language_loss": 0.78985357, + "learning_rate": 0.00038453488453642074, + "loss": 0.80088001, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.16003418, + "step": 3051, + "time_per_iteration": 2.6733691692352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101193, + "balance_loss_mlp": 1.0853616, + "epoch": 0.587148903424394, + "flos": 569385386496.0, + "grad_norm": 0.11499584820010532, + "language_loss": 0.86622018, + "learning_rate": 0.00038423178555121697, + "loss": 0.87723207, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.1583252, + "step": 3052, + "time_per_iteration": 2.7339212894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091842, + "balance_loss_mlp": 1.07583237, + "epoch": 0.5873412851096576, + "flos": 747296824320.0, + "grad_norm": 0.06975664982977658, + "language_loss": 0.85649264, + "learning_rate": 0.00038392873151121994, + "loss": 0.86741114, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.16003418, + "step": 3053, + "time_per_iteration": 3.0498745441436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094641, + "balance_loss_mlp": 1.07848823, + "epoch": 0.5875336667949211, + "flos": 528142477824.0, + "grad_norm": 0.07594371919491524, + "language_loss": 0.82729709, + "learning_rate": 0.0003836257225340859, + "loss": 0.83824348, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.16149902, + "step": 3054, + "time_per_iteration": 2.6312718391418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083342, + "balance_loss_mlp": 1.0662595, + "epoch": 0.5877260484801847, + "flos": 824166586368.0, + "grad_norm": 0.07226211151265562, + "language_loss": 0.81785333, + "learning_rate": 0.00038332275873745336, + "loss": 0.82868683, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.17102051, + "step": 3055, + 
"time_per_iteration": 3.0953447818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086607, + "balance_loss_mlp": 1.06990623, + "epoch": 0.5879184301654482, + "flos": 591598162944.0, + "grad_norm": 0.05891266503615663, + "language_loss": 0.82779503, + "learning_rate": 0.0003830198402389431, + "loss": 0.83866107, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.16711426, + "step": 3056, + "time_per_iteration": 2.7385828495025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022532, + "balance_loss_mlp": 1.01485538, + "epoch": 0.5881108118507118, + "flos": 1545805513728.0, + "grad_norm": 0.023195211062617696, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78371465, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.07666016, + "step": 3057, + "time_per_iteration": 5.0122692584991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082378, + "balance_loss_mlp": 1.06487858, + "epoch": 0.5883031935359754, + "flos": 489597654528.0, + "grad_norm": 0.09420327310468278, + "language_loss": 0.82856947, + "learning_rate": 0.0003824141396066855, + "loss": 0.83939326, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.17504883, + "step": 3058, + "time_per_iteration": 2.630829334259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086117, + "balance_loss_mlp": 1.06941545, + "epoch": 0.588495575221239, + "flos": 582836539392.0, + "grad_norm": 0.07561205741670568, + "language_loss": 0.82764673, + "learning_rate": 0.000382111357708092, + "loss": 0.83850795, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.16711426, + "step": 3059, + "time_per_iteration": 2.754732608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079118, + "balance_loss_mlp": 1.06203532, + "epoch": 0.5886879569065026, + "flos": 661048003584.0, + "grad_norm": 0.07214212654246877, + "language_loss": 0.83606875, + 
"learning_rate": 0.00038180862157792864, + "loss": 0.84685993, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.17102051, + "step": 3060, + "time_per_iteration": 2.8452963829040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079154, + "balance_loss_mlp": 1.06195176, + "epoch": 0.588880338591766, + "flos": 562657425408.0, + "grad_norm": 0.06766423660124334, + "language_loss": 0.81912309, + "learning_rate": 0.0003815059313337279, + "loss": 0.82991457, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.17224121, + "step": 3061, + "time_per_iteration": 2.699923515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075885, + "balance_loss_mlp": 1.05862319, + "epoch": 0.5890727202770296, + "flos": 554730923520.0, + "grad_norm": 0.05609969141419105, + "language_loss": 0.78319967, + "learning_rate": 0.00038120328709300436, + "loss": 0.79395854, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.17272949, + "step": 3062, + "time_per_iteration": 2.9140214920043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073381, + "balance_loss_mlp": 1.05580938, + "epoch": 0.5892651019622932, + "flos": 655520781312.0, + "grad_norm": 0.06388746068798092, + "language_loss": 0.83677167, + "learning_rate": 0.0003809006889732549, + "loss": 0.84750545, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.17590332, + "step": 3063, + "time_per_iteration": 2.812375068664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073036, + "balance_loss_mlp": 1.05551219, + "epoch": 0.5894574836475568, + "flos": 453202490880.0, + "grad_norm": 0.1840205152254721, + "language_loss": 0.87883544, + "learning_rate": 0.0003805981370919589, + "loss": 0.88956577, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.17529297, + "step": 3064, + "time_per_iteration": 2.5644187927246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073964, + 
"balance_loss_mlp": 1.05604672, + "epoch": 0.5896498653328203, + "flos": 519032489472.0, + "grad_norm": 0.08741335688742048, + "language_loss": 0.83813435, + "learning_rate": 0.0003802956315665771, + "loss": 0.84887397, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.17932129, + "step": 3065, + "time_per_iteration": 2.6985597610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077561, + "balance_loss_mlp": 1.0604192, + "epoch": 0.5898422470180839, + "flos": 549050628096.0, + "grad_norm": 0.09549414349914971, + "language_loss": 0.81565332, + "learning_rate": 0.0003799931725145529, + "loss": 0.82642901, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.17150879, + "step": 3066, + "time_per_iteration": 2.621553897857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_mlp": 1.06172466, + "epoch": 0.5900346287033474, + "flos": 524312663040.0, + "grad_norm": 0.06470265589627064, + "language_loss": 0.85731423, + "learning_rate": 0.00037969076005331083, + "loss": 0.86810863, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.17736816, + "step": 3067, + "time_per_iteration": 2.7705938816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108525, + "balance_loss_mlp": 1.06776178, + "epoch": 0.590227010388611, + "flos": 567156805632.0, + "grad_norm": 0.07323535980547291, + "language_loss": 0.87987936, + "learning_rate": 0.00037938839430025817, + "loss": 0.89073181, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.17504883, + "step": 3068, + "time_per_iteration": 2.6688857078552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085401, + "balance_loss_mlp": 1.06792498, + "epoch": 0.5904193920738746, + "flos": 583333208064.0, + "grad_norm": 0.13096377841439616, + "language_loss": 0.85380679, + "learning_rate": 0.0003790860753727835, + "loss": 0.86466074, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 
0.17492676, + "step": 3069, + "time_per_iteration": 2.9018454551696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091061, + "balance_loss_mlp": 1.07345426, + "epoch": 0.5906117737591381, + "flos": 529701493248.0, + "grad_norm": 0.0726049430242405, + "language_loss": 0.82249814, + "learning_rate": 0.00037878380338825766, + "loss": 0.83340883, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.1763916, + "step": 3070, + "time_per_iteration": 2.695953607559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095711, + "balance_loss_mlp": 1.07847357, + "epoch": 0.5908041554444017, + "flos": 684229151232.0, + "grad_norm": 0.07160608760806797, + "language_loss": 0.81351429, + "learning_rate": 0.00037848157846403287, + "loss": 0.82447141, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.17248535, + "step": 3071, + "time_per_iteration": 2.900130271911621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096895, + "balance_loss_mlp": 1.07976437, + "epoch": 0.5909965371296653, + "flos": 550001746944.0, + "grad_norm": 0.08831271669304017, + "language_loss": 0.83602202, + "learning_rate": 0.0003781794007174435, + "loss": 0.846991, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.17150879, + "step": 3072, + "time_per_iteration": 2.7315585613250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052839, + "balance_loss_mlp": 1.0453527, + "epoch": 0.5911889188149289, + "flos": 1492361750016.0, + "grad_norm": 0.018548344346269084, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75127375, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.07470703, + "step": 3073, + "time_per_iteration": 4.843595027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096296, + "balance_loss_mlp": 1.07984531, + "epoch": 0.5913813005001923, + "flos": 487880423424.0, + "grad_norm": 0.06605464812454943, + 
"language_loss": 0.80771315, + "learning_rate": 0.0003775751872264152, + "loss": 0.81867611, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.16455078, + "step": 3074, + "time_per_iteration": 2.812434196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088175, + "balance_loss_mlp": 1.07113981, + "epoch": 0.5915736821854559, + "flos": 573331198464.0, + "grad_norm": 0.08890011139795934, + "language_loss": 0.86803812, + "learning_rate": 0.0003772731517165527, + "loss": 0.87891984, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.17041016, + "step": 3075, + "time_per_iteration": 2.8199949264526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087435, + "balance_loss_mlp": 1.07135379, + "epoch": 0.5917660638707195, + "flos": 789518389248.0, + "grad_norm": 0.06956331546073297, + "language_loss": 0.83378977, + "learning_rate": 0.0003769711638534784, + "loss": 0.8446641, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.16064453, + "step": 3076, + "time_per_iteration": 3.021451711654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097469, + "balance_loss_mlp": 1.08068419, + "epoch": 0.5919584455559831, + "flos": 528740462592.0, + "grad_norm": 0.07608235771804774, + "language_loss": 0.79065943, + "learning_rate": 0.00037666922375443446, + "loss": 0.80163419, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.16796875, + "step": 3077, + "time_per_iteration": 2.602043867111206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092109, + "balance_loss_mlp": 1.076123, + "epoch": 0.5921508272412467, + "flos": 560606510592.0, + "grad_norm": 0.09346086613563626, + "language_loss": 0.81744075, + "learning_rate": 0.00037636733153664396, + "loss": 0.82836187, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.15979004, + "step": 3078, + "time_per_iteration": 2.8222453594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01093493, + "balance_loss_mlp": 1.07719743, + "epoch": 0.5923432089265102, + "flos": 563272662528.0, + "grad_norm": 0.1116363853226753, + "language_loss": 0.79912782, + "learning_rate": 0.0003760654873173124, + "loss": 0.81006277, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.16296387, + "step": 3079, + "time_per_iteration": 2.6946070194244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085907, + "balance_loss_mlp": 1.06951547, + "epoch": 0.5925355906117737, + "flos": 495740113920.0, + "grad_norm": 0.06915984482876121, + "language_loss": 0.81859291, + "learning_rate": 0.00037576369121362566, + "loss": 0.82945192, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.16394043, + "step": 3080, + "time_per_iteration": 2.6502840518951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088263, + "balance_loss_mlp": 1.07191896, + "epoch": 0.5927279722970373, + "flos": 566249730048.0, + "grad_norm": 0.07693331015944839, + "language_loss": 0.8159368, + "learning_rate": 0.0003754619433427516, + "loss": 0.82681942, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.16345215, + "step": 3081, + "time_per_iteration": 2.9385058879852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084208, + "balance_loss_mlp": 1.06749439, + "epoch": 0.5929203539823009, + "flos": 666970578432.0, + "grad_norm": 0.07095697248954357, + "language_loss": 0.77517045, + "learning_rate": 0.0003751602438218392, + "loss": 0.78601247, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.16723633, + "step": 3082, + "time_per_iteration": 2.8245561122894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083121, + "balance_loss_mlp": 1.06693244, + "epoch": 0.5931127356675644, + "flos": 555744084480.0, + "grad_norm": 0.1021077750392874, + "language_loss": 0.83509332, + "learning_rate": 0.0003748585927680186, + "loss": 0.8459245, + "num_input_tokens_seen": 257210592, + 
"router_z_loss_mlp": 0.16186523, + "step": 3083, + "time_per_iteration": 2.6818346977233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084171, + "balance_loss_mlp": 1.06721938, + "epoch": 0.593305117352828, + "flos": 535194210816.0, + "grad_norm": 0.06846862154983226, + "language_loss": 0.82637662, + "learning_rate": 0.00037455699029840086, + "loss": 0.83721828, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.16967773, + "step": 3084, + "time_per_iteration": 2.6860570907592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088457, + "balance_loss_mlp": 1.07176781, + "epoch": 0.5934974990380916, + "flos": 593957795328.0, + "grad_norm": 0.06710726384898401, + "language_loss": 0.8462739, + "learning_rate": 0.0003742554365300787, + "loss": 0.85715848, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.16699219, + "step": 3085, + "time_per_iteration": 2.749816656112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088228, + "balance_loss_mlp": 1.07143116, + "epoch": 0.5936898807233552, + "flos": 712673220096.0, + "grad_norm": 0.08250802724924795, + "language_loss": 0.78595787, + "learning_rate": 0.0003739539315801255, + "loss": 0.79684019, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.16809082, + "step": 3086, + "time_per_iteration": 2.982919216156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092705, + "balance_loss_mlp": 1.07571757, + "epoch": 0.5938822624086187, + "flos": 391896465408.0, + "grad_norm": 0.083760246794696, + "language_loss": 0.91647482, + "learning_rate": 0.000373652475565596, + "loss": 0.9274019, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.16992188, + "step": 3087, + "time_per_iteration": 2.4816558361053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102413, + "balance_loss_mlp": 1.08528244, + "epoch": 0.5940746440938822, + "flos": 480285033984.0, + "grad_norm": 
0.09245346089356003, + "language_loss": 0.81352496, + "learning_rate": 0.00037335106860352587, + "loss": 0.82454908, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.17138672, + "step": 3088, + "time_per_iteration": 2.675565719604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095107, + "balance_loss_mlp": 1.07863212, + "epoch": 0.5942670257791458, + "flos": 483336626688.0, + "grad_norm": 0.10172018328041595, + "language_loss": 0.83090484, + "learning_rate": 0.00037304971081093146, + "loss": 0.84185594, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.16479492, + "step": 3089, + "time_per_iteration": 2.614063024520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102174, + "balance_loss_mlp": 1.08573484, + "epoch": 0.5944594074644094, + "flos": 547936151040.0, + "grad_norm": 0.09417550180705583, + "language_loss": 0.81048489, + "learning_rate": 0.00037274840230481024, + "loss": 0.82150662, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.16442871, + "step": 3090, + "time_per_iteration": 2.791287899017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106483, + "balance_loss_mlp": 1.09013939, + "epoch": 0.594651789149673, + "flos": 449179955712.0, + "grad_norm": 0.08210045649904979, + "language_loss": 0.79059577, + "learning_rate": 0.00037244714320214077, + "loss": 0.80166066, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.16345215, + "step": 3091, + "time_per_iteration": 2.5437703132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101492, + "balance_loss_mlp": 1.08511281, + "epoch": 0.5948441708349365, + "flos": 596267868672.0, + "grad_norm": 0.06960715408232113, + "language_loss": 0.83210528, + "learning_rate": 0.000372145933619882, + "loss": 0.84312022, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.16381836, + "step": 3092, + "time_per_iteration": 2.902186155319214 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01112879, + "balance_loss_mlp": 1.0964278, + "epoch": 0.5950365525202, + "flos": 548516883456.0, + "grad_norm": 0.11673775861228046, + "language_loss": 0.82268316, + "learning_rate": 0.000371844773674974, + "loss": 0.833812, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.16455078, + "step": 3093, + "time_per_iteration": 2.6614809036254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116363, + "balance_loss_mlp": 1.10023379, + "epoch": 0.5952289342054636, + "flos": 654700340736.0, + "grad_norm": 0.0944691086002383, + "language_loss": 0.81785637, + "learning_rate": 0.0003715436634843375, + "loss": 0.82902002, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.16125488, + "step": 3094, + "time_per_iteration": 2.90022873878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117363, + "balance_loss_mlp": 1.10172296, + "epoch": 0.5954213158907272, + "flos": 603364018176.0, + "grad_norm": 0.057224396595454204, + "language_loss": 0.80872512, + "learning_rate": 0.00037124260316487355, + "loss": 0.81989878, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.15625, + "step": 3095, + "time_per_iteration": 2.885049819946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114273, + "balance_loss_mlp": 1.09841847, + "epoch": 0.5956136975759908, + "flos": 486331319808.0, + "grad_norm": 0.06086987109203959, + "language_loss": 0.89374322, + "learning_rate": 0.0003709415928334643, + "loss": 0.90488601, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.15844727, + "step": 3096, + "time_per_iteration": 2.6082546710968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011177, + "balance_loss_mlp": 1.10172629, + "epoch": 0.5958060792612543, + "flos": 658777204224.0, + "grad_norm": 0.09348672972793858, + "language_loss": 0.80559552, + "learning_rate": 0.00037064063260697233, + "loss": 0.81677252, + "num_input_tokens_seen": 
258228896, + "router_z_loss_mlp": 0.15966797, + "step": 3097, + "time_per_iteration": 2.8901162147521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123233, + "balance_loss_mlp": 1.10749698, + "epoch": 0.5959984609465179, + "flos": 723559537152.0, + "grad_norm": 0.06876216438303968, + "language_loss": 0.78693187, + "learning_rate": 0.0003703397226022407, + "loss": 0.79816419, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.15722656, + "step": 3098, + "time_per_iteration": 3.066073179244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102465, + "balance_loss_mlp": 1.09416783, + "epoch": 0.5961908426317815, + "flos": 1519849557504.0, + "grad_norm": 0.03442912107402327, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.7660234, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.08300781, + "step": 3099, + "time_per_iteration": 4.9653050899505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127115, + "balance_loss_mlp": 1.11136746, + "epoch": 0.596383224317045, + "flos": 532614693888.0, + "grad_norm": 0.0680420214228425, + "language_loss": 0.8297379, + "learning_rate": 0.0003697380537253339, + "loss": 0.84100908, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.15734863, + "step": 3100, + "time_per_iteration": 2.715177059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113476, + "balance_loss_mlp": 1.0978117, + "epoch": 0.5965756060023086, + "flos": 591210150912.0, + "grad_norm": 0.06669871573577384, + "language_loss": 0.81245238, + "learning_rate": 0.0003694372950867471, + "loss": 0.82358712, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.15649414, + "step": 3101, + "time_per_iteration": 2.8005011081695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123061, + "balance_loss_mlp": 1.10731363, + "epoch": 0.5967679876875721, + "flos": 862054327296.0, + "grad_norm": 
0.07790109934746459, + "language_loss": 0.77269602, + "learning_rate": 0.0003691365871370976, + "loss": 0.78392667, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.15734863, + "step": 3102, + "time_per_iteration": 3.077610731124878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118239, + "balance_loss_mlp": 1.10267067, + "epoch": 0.5969603693728357, + "flos": 553834132992.0, + "grad_norm": 0.06403529919974375, + "language_loss": 0.85239542, + "learning_rate": 0.00036883592999313093, + "loss": 0.86357784, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.15551758, + "step": 3103, + "time_per_iteration": 2.6910035610198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123035, + "balance_loss_mlp": 1.10726357, + "epoch": 0.5971527510580993, + "flos": 718662606336.0, + "grad_norm": 0.07439514059918453, + "language_loss": 0.7913959, + "learning_rate": 0.0003685353237715722, + "loss": 0.80262625, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.15759277, + "step": 3104, + "time_per_iteration": 2.8957912921905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118284, + "balance_loss_mlp": 1.10222602, + "epoch": 0.5973451327433629, + "flos": 647631355392.0, + "grad_norm": 0.09765250688336868, + "language_loss": 0.81377506, + "learning_rate": 0.0003682347685891274, + "loss": 0.82495785, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.16052246, + "step": 3105, + "time_per_iteration": 2.84584379196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106359, + "balance_loss_mlp": 1.09007454, + "epoch": 0.5975375144286263, + "flos": 721716397056.0, + "grad_norm": 0.07268165375697674, + "language_loss": 0.805511, + "learning_rate": 0.0003679342645624822, + "loss": 0.81657457, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.1628418, + "step": 3106, + "time_per_iteration": 3.0009236335754395 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01116176, + "balance_loss_mlp": 1.09978509, + "epoch": 0.5977298961138899, + "flos": 750961082880.0, + "grad_norm": 0.08276382082752762, + "language_loss": 0.81614435, + "learning_rate": 0.0003676338118083025, + "loss": 0.82730609, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.16394043, + "step": 3107, + "time_per_iteration": 3.088297128677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103194, + "balance_loss_mlp": 1.08736336, + "epoch": 0.5979222777991535, + "flos": 530961702912.0, + "grad_norm": 0.10722680659176895, + "language_loss": 0.79196644, + "learning_rate": 0.0003673334104432347, + "loss": 0.80299842, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.15820312, + "step": 3108, + "time_per_iteration": 2.634643077850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100064, + "balance_loss_mlp": 1.08379245, + "epoch": 0.5981146594844171, + "flos": 621749551104.0, + "grad_norm": 0.07294397192010518, + "language_loss": 0.8350544, + "learning_rate": 0.0003670330605839048, + "loss": 0.84605503, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.16271973, + "step": 3109, + "time_per_iteration": 2.8294010162353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091507, + "balance_loss_mlp": 1.0755446, + "epoch": 0.5983070411696807, + "flos": 603589045248.0, + "grad_norm": 0.08059004302640393, + "language_loss": 0.76664943, + "learning_rate": 0.0003667327623469191, + "loss": 0.77756447, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.1595459, + "step": 3110, + "time_per_iteration": 2.784902334213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100362, + "balance_loss_mlp": 1.084126, + "epoch": 0.5984994228549442, + "flos": 633483472896.0, + "grad_norm": 0.07319281645054936, + "language_loss": 0.77725756, + "learning_rate": 0.00036643251584886333, + "loss": 0.78826118, + "num_input_tokens_seen": 
259454336, + "router_z_loss_mlp": 0.16235352, + "step": 3111, + "time_per_iteration": 2.795421838760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100904, + "balance_loss_mlp": 1.08444118, + "epoch": 0.5986918045402078, + "flos": 525278836224.0, + "grad_norm": 0.07234799336755846, + "language_loss": 0.8192088, + "learning_rate": 0.00036613232120630393, + "loss": 0.83021784, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.16467285, + "step": 3112, + "time_per_iteration": 2.6119191646575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095181, + "balance_loss_mlp": 1.07855165, + "epoch": 0.5988841862254713, + "flos": 483180982272.0, + "grad_norm": 0.1220679262263155, + "language_loss": 0.7997117, + "learning_rate": 0.00036583217853578643, + "loss": 0.81066352, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.16638184, + "step": 3113, + "time_per_iteration": 2.5559191703796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095031, + "balance_loss_mlp": 1.07856846, + "epoch": 0.5990765679107349, + "flos": 1140149924352.0, + "grad_norm": 0.06954821000435275, + "language_loss": 0.77413309, + "learning_rate": 0.000365532087953837, + "loss": 0.78508341, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.16467285, + "step": 3114, + "time_per_iteration": 3.6444194316864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093034, + "balance_loss_mlp": 1.07666647, + "epoch": 0.5992689495959984, + "flos": 516986717184.0, + "grad_norm": 0.07355388338928669, + "language_loss": 0.89153886, + "learning_rate": 0.00036523204957696065, + "loss": 0.90246928, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.16369629, + "step": 3115, + "time_per_iteration": 2.6114542484283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090631, + "balance_loss_mlp": 1.07385826, + "epoch": 0.599461331281262, + "flos": 744618562560.0, + "grad_norm": 
0.06661163617003031, + "language_loss": 0.80990088, + "learning_rate": 0.00036493206352164324, + "loss": 0.82080722, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.16784668, + "step": 3116, + "time_per_iteration": 2.977773666381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099589, + "balance_loss_mlp": 1.08299482, + "epoch": 0.5996537129665256, + "flos": 592359132672.0, + "grad_norm": 0.06605770678363264, + "language_loss": 0.85320091, + "learning_rate": 0.000364632129904349, + "loss": 0.86419678, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.16601562, + "step": 3117, + "time_per_iteration": 2.7504782676696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109875, + "balance_loss_mlp": 1.08246565, + "epoch": 0.5998460946517892, + "flos": 559010419200.0, + "grad_norm": 0.07896925435607946, + "language_loss": 0.78125691, + "learning_rate": 0.00036433224884152283, + "loss": 0.79224437, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.1628418, + "step": 3118, + "time_per_iteration": 2.762640953063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106506, + "balance_loss_mlp": 1.09019828, + "epoch": 0.6000384763370528, + "flos": 484567100928.0, + "grad_norm": 0.08654027448722386, + "language_loss": 0.77639025, + "learning_rate": 0.00036403242044958875, + "loss": 0.78745532, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.16308594, + "step": 3119, + "time_per_iteration": 2.590341567993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105329, + "balance_loss_mlp": 1.08873463, + "epoch": 0.6002308580223162, + "flos": 596767108608.0, + "grad_norm": 0.12490963722323402, + "language_loss": 0.91469646, + "learning_rate": 0.0003637326448449507, + "loss": 0.92574978, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.16601562, + "step": 3120, + "time_per_iteration": 2.757040500640869 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01114298, + "balance_loss_mlp": 1.09782338, + "epoch": 0.6004232397075798, + "flos": 545146661376.0, + "grad_norm": 0.07048281834234121, + "language_loss": 0.85906887, + "learning_rate": 0.00036343292214399177, + "loss": 0.87021184, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.16479492, + "step": 3121, + "time_per_iteration": 2.7731616497039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110866, + "balance_loss_mlp": 1.09368825, + "epoch": 0.6006156213928434, + "flos": 629947694592.0, + "grad_norm": 0.08856935015061373, + "language_loss": 0.77217454, + "learning_rate": 0.00036313325246307456, + "loss": 0.78328323, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.171875, + "step": 3122, + "time_per_iteration": 2.8254263401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107144, + "balance_loss_mlp": 1.0897516, + "epoch": 0.600808003078107, + "flos": 582315277824.0, + "grad_norm": 0.07082824318872671, + "language_loss": 0.87116647, + "learning_rate": 0.0003628336359185411, + "loss": 0.88223791, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.17419434, + "step": 3123, + "time_per_iteration": 2.6960785388946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104705, + "balance_loss_mlp": 1.08815873, + "epoch": 0.6010003847633705, + "flos": 635274855936.0, + "grad_norm": 0.09352377906746982, + "language_loss": 0.75570095, + "learning_rate": 0.000362534072626713, + "loss": 0.76674795, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.16552734, + "step": 3124, + "time_per_iteration": 2.7963545322418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094202, + "balance_loss_mlp": 1.07738113, + "epoch": 0.6011927664486341, + "flos": 718763922432.0, + "grad_norm": 0.08561674190647896, + "language_loss": 0.81475127, + "learning_rate": 0.00036223456270389093, + "loss": 0.82569331, + "num_input_tokens_seen": 
260499616, + "router_z_loss_mlp": 0.16833496, + "step": 3125, + "time_per_iteration": 2.992478609085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085838, + "balance_loss_mlp": 1.06857657, + "epoch": 0.6013851481338977, + "flos": 499036184064.0, + "grad_norm": 0.08087477259987003, + "language_loss": 0.80765188, + "learning_rate": 0.00036193510626635517, + "loss": 0.81851029, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.17272949, + "step": 3126, + "time_per_iteration": 2.6718900203704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077369, + "balance_loss_mlp": 1.05972588, + "epoch": 0.6015775298191612, + "flos": 749587447296.0, + "grad_norm": 0.08853778728712877, + "language_loss": 0.81355464, + "learning_rate": 0.0003616357034303649, + "loss": 0.82432842, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.17663574, + "step": 3127, + "time_per_iteration": 2.9547274112701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075422, + "balance_loss_mlp": 1.05762434, + "epoch": 0.6017699115044248, + "flos": 593063202816.0, + "grad_norm": 0.1711605115844366, + "language_loss": 0.78441834, + "learning_rate": 0.0003613363543121584, + "loss": 0.79517257, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.17810059, + "step": 3128, + "time_per_iteration": 2.886970281600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065387, + "balance_loss_mlp": 1.04813766, + "epoch": 0.6019622931896883, + "flos": 515111270400.0, + "grad_norm": 0.08758734410380958, + "language_loss": 0.85043442, + "learning_rate": 0.00036103705902795357, + "loss": 0.86108834, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.17260742, + "step": 3129, + "time_per_iteration": 2.748079776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072993, + "balance_loss_mlp": 1.0555644, + "epoch": 0.6021546748749519, + "flos": 490469852160.0, + "grad_norm": 
0.09694707916442274, + "language_loss": 0.7971251, + "learning_rate": 0.0003607378176939471, + "loss": 0.80785501, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.17443848, + "step": 3130, + "time_per_iteration": 2.6402640342712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069092, + "balance_loss_mlp": 1.05256987, + "epoch": 0.6023470565602155, + "flos": 541032721920.0, + "grad_norm": 0.08416157217627585, + "language_loss": 0.82138842, + "learning_rate": 0.00036043863042631465, + "loss": 0.83207935, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.1652832, + "step": 3131, + "time_per_iteration": 2.679304838180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069317, + "balance_loss_mlp": 1.05229378, + "epoch": 0.6025394382454791, + "flos": 845020408320.0, + "grad_norm": 0.08544531393878185, + "language_loss": 0.76554382, + "learning_rate": 0.00036013949734121133, + "loss": 0.77623701, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.17028809, + "step": 3132, + "time_per_iteration": 3.1334645748138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071138, + "balance_loss_mlp": 1.05466342, + "epoch": 0.6027318199307425, + "flos": 577173496320.0, + "grad_norm": 0.08104461370045753, + "language_loss": 0.82059807, + "learning_rate": 0.00035984041855477043, + "loss": 0.8313095, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.16467285, + "step": 3133, + "time_per_iteration": 2.7347941398620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045627, + "balance_loss_mlp": 1.03842688, + "epoch": 0.6029242016160061, + "flos": 1470976754688.0, + "grad_norm": 0.025003389778794672, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79755521, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.07177734, + "step": 3134, + "time_per_iteration": 4.970470428466797 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01076232, + "balance_loss_mlp": 1.05934048, + "epoch": 0.6031165833012697, + "flos": 480744626688.0, + "grad_norm": 0.07365504722099776, + "language_loss": 0.79866755, + "learning_rate": 0.00035924242434230637, + "loss": 0.80942982, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.16906738, + "step": 3135, + "time_per_iteration": 2.7135050296783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107956, + "balance_loss_mlp": 1.06296587, + "epoch": 0.6033089649865333, + "flos": 499468612608.0, + "grad_norm": 0.08294049229736823, + "language_loss": 0.78440452, + "learning_rate": 0.00035894350914844516, + "loss": 0.79520017, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.16601562, + "step": 3136, + "time_per_iteration": 2.6597416400909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079884, + "balance_loss_mlp": 1.06325424, + "epoch": 0.6035013466717969, + "flos": 556613710848.0, + "grad_norm": 0.08267470686196479, + "language_loss": 0.83196414, + "learning_rate": 0.0003586446487175703, + "loss": 0.84276295, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.16638184, + "step": 3137, + "time_per_iteration": 2.7022488117218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084641, + "balance_loss_mlp": 1.06798732, + "epoch": 0.6036937283570604, + "flos": 594827421696.0, + "grad_norm": 0.064575038850489, + "language_loss": 0.85214019, + "learning_rate": 0.0003583458431657099, + "loss": 0.86298662, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.16662598, + "step": 3138, + "time_per_iteration": 2.7720208168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084009, + "balance_loss_mlp": 1.06771302, + "epoch": 0.603886110042324, + "flos": 540958569984.0, + "grad_norm": 0.09877124262847642, + "language_loss": 0.82838678, + "learning_rate": 0.00035804709260887056, + "loss": 0.83922684, + 
"num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.16296387, + "step": 3139, + "time_per_iteration": 2.6879312992095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086877, + "balance_loss_mlp": 1.07065237, + "epoch": 0.6040784917275875, + "flos": 518582808576.0, + "grad_norm": 0.07215366111763855, + "language_loss": 0.8912158, + "learning_rate": 0.0003577483971630373, + "loss": 0.90208459, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.16223145, + "step": 3140, + "time_per_iteration": 2.734809398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085015, + "balance_loss_mlp": 1.06892204, + "epoch": 0.6042708734128511, + "flos": 660751395840.0, + "grad_norm": 0.05656780869347305, + "language_loss": 0.84707594, + "learning_rate": 0.00035744975694417414, + "loss": 0.85792601, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.16088867, + "step": 3141, + "time_per_iteration": 2.8830533027648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083849, + "balance_loss_mlp": 1.06837583, + "epoch": 0.6044632550981146, + "flos": 572330520576.0, + "grad_norm": 0.12103965495464937, + "language_loss": 0.82471883, + "learning_rate": 0.00035715117206822344, + "loss": 0.83555734, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.15454102, + "step": 3142, + "time_per_iteration": 2.838871479034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085944, + "balance_loss_mlp": 1.06989884, + "epoch": 0.6046556367833782, + "flos": 546681083904.0, + "grad_norm": 0.07532409559899438, + "language_loss": 0.80957747, + "learning_rate": 0.0003568526426511065, + "loss": 0.82043689, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.16040039, + "step": 3143, + "time_per_iteration": 2.646676540374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088746, + "balance_loss_mlp": 1.07312953, + "epoch": 0.6048480184686418, + "flos": 
776838117888.0, + "grad_norm": 0.09699368707048923, + "language_loss": 0.82747424, + "learning_rate": 0.000356554168808722, + "loss": 0.83836174, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.15612793, + "step": 3144, + "time_per_iteration": 2.9851598739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093552, + "balance_loss_mlp": 1.07773244, + "epoch": 0.6050404001539054, + "flos": 657144036864.0, + "grad_norm": 0.07251607714921615, + "language_loss": 0.84944451, + "learning_rate": 0.00035625575065694837, + "loss": 0.86037999, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.15808105, + "step": 3145, + "time_per_iteration": 2.8598599433898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090889, + "balance_loss_mlp": 1.07443857, + "epoch": 0.605232781839169, + "flos": 548983816704.0, + "grad_norm": 0.07064458078135354, + "language_loss": 0.77895433, + "learning_rate": 0.0003559573883116415, + "loss": 0.78986323, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.16455078, + "step": 3146, + "time_per_iteration": 2.733262062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089343, + "balance_loss_mlp": 1.07359576, + "epoch": 0.6054251635244324, + "flos": 605402449920.0, + "grad_norm": 0.07444440196123078, + "language_loss": 0.85480058, + "learning_rate": 0.00035565908188863604, + "loss": 0.86569399, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.15734863, + "step": 3147, + "time_per_iteration": 2.853851079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091792, + "balance_loss_mlp": 1.07599723, + "epoch": 0.605617545209696, + "flos": 613679887872.0, + "grad_norm": 0.06196180807513896, + "language_loss": 0.79582435, + "learning_rate": 0.00035536083150374464, + "loss": 0.80674225, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.15783691, + "step": 3148, + "time_per_iteration": 2.776559352874756 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_mlp": 1.03995907, + "epoch": 0.6058099268949596, + "flos": 1498301577216.0, + "grad_norm": 0.024337037001299088, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75795162, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.07226562, + "step": 3149, + "time_per_iteration": 4.840685129165649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091535, + "balance_loss_mlp": 1.07552564, + "epoch": 0.6060023085802232, + "flos": 670476621312.0, + "grad_norm": 0.06209204496769419, + "language_loss": 0.85722816, + "learning_rate": 0.0003547644993114475, + "loss": 0.8681435, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.16003418, + "step": 3150, + "time_per_iteration": 2.8153529167175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092959, + "balance_loss_mlp": 1.07712793, + "epoch": 0.6061946902654868, + "flos": 606168562176.0, + "grad_norm": 0.07176933512118068, + "language_loss": 0.79877794, + "learning_rate": 0.00035446641773555806, + "loss": 0.80970764, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.15820312, + "step": 3151, + "time_per_iteration": 2.757474184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094094, + "balance_loss_mlp": 1.0779295, + "epoch": 0.6063870719507503, + "flos": 557844185088.0, + "grad_norm": 0.10666232173403664, + "language_loss": 0.86817247, + "learning_rate": 0.000354168392660816, + "loss": 0.87911344, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.16162109, + "step": 3152, + "time_per_iteration": 2.7577521800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093009, + "balance_loss_mlp": 1.07742882, + "epoch": 0.6065794536360138, + "flos": 557154796032.0, + "grad_norm": 0.06835832262029293, + "language_loss": 0.82626665, + "learning_rate": 0.0003538704242029252, + "loss": 
0.83719671, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.15576172, + "step": 3153, + "time_per_iteration": 2.7824299335479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096364, + "balance_loss_mlp": 1.08066463, + "epoch": 0.6067718353212774, + "flos": 690144385536.0, + "grad_norm": 0.07699381631687732, + "language_loss": 0.77828813, + "learning_rate": 0.0003535725124775672, + "loss": 0.7892518, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.15686035, + "step": 3154, + "time_per_iteration": 2.8603780269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101879, + "balance_loss_mlp": 1.085392, + "epoch": 0.606964217006541, + "flos": 521804726784.0, + "grad_norm": 0.06603606941894191, + "language_loss": 0.86388272, + "learning_rate": 0.00035327465760040126, + "loss": 0.87490153, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.16491699, + "step": 3155, + "time_per_iteration": 2.731767177581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102466, + "balance_loss_mlp": 1.08700442, + "epoch": 0.6071565986918045, + "flos": 641555707392.0, + "grad_norm": 0.08742295718167487, + "language_loss": 0.84376252, + "learning_rate": 0.00035297685968706526, + "loss": 0.85478723, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.15441895, + "step": 3156, + "time_per_iteration": 2.7879996299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099222, + "balance_loss_mlp": 1.08361709, + "epoch": 0.6073489803770681, + "flos": 560581917696.0, + "grad_norm": 0.07206801524938761, + "language_loss": 0.82717532, + "learning_rate": 0.00035267911885317454, + "loss": 0.83816749, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.15588379, + "step": 3157, + "time_per_iteration": 2.6752853393554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096892, + "balance_loss_mlp": 1.08108473, + "epoch": 0.6075413620623317, + 
"flos": 586088193024.0, + "grad_norm": 0.06913859395071588, + "language_loss": 0.81624317, + "learning_rate": 0.0003523814352143222, + "loss": 0.8272121, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.15795898, + "step": 3158, + "time_per_iteration": 2.851680040359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096859, + "balance_loss_mlp": 1.08079004, + "epoch": 0.6077337437475953, + "flos": 630812551680.0, + "grad_norm": 0.07191756501085539, + "language_loss": 0.90879536, + "learning_rate": 0.00035208380888607937, + "loss": 0.91976392, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.16064453, + "step": 3159, + "time_per_iteration": 2.8229289054870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030172, + "balance_loss_mlp": 1.02311516, + "epoch": 0.6079261254328588, + "flos": 1468503696384.0, + "grad_norm": 0.017458667771122316, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80492157, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.07080078, + "step": 3160, + "time_per_iteration": 4.860463619232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026979, + "balance_loss_mlp": 1.01992178, + "epoch": 0.6081185071181223, + "flos": 1523024861184.0, + "grad_norm": 0.015423076795417967, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76719207, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.07080078, + "step": 3161, + "time_per_iteration": 5.027961254119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090745, + "balance_loss_mlp": 1.07459164, + "epoch": 0.6083108888033859, + "flos": 556319674368.0, + "grad_norm": 0.06716496050507109, + "language_loss": 0.81388539, + "learning_rate": 0.00035119127492038446, + "loss": 0.82479286, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.16149902, + "step": 3162, + "time_per_iteration": 
2.8567075729370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090321, + "balance_loss_mlp": 1.07425177, + "epoch": 0.6085032704886495, + "flos": 841166000640.0, + "grad_norm": 0.07519938175586753, + "language_loss": 0.82571161, + "learning_rate": 0.00035089387898984436, + "loss": 0.83661485, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.16064453, + "step": 3163, + "time_per_iteration": 3.0894179344177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093705, + "balance_loss_mlp": 1.07734919, + "epoch": 0.6086956521739131, + "flos": 684792631296.0, + "grad_norm": 0.07531226352360243, + "language_loss": 0.81800103, + "learning_rate": 0.0003505965409474343, + "loss": 0.82893807, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.16357422, + "step": 3164, + "time_per_iteration": 2.909203290939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088443, + "balance_loss_mlp": 1.07221854, + "epoch": 0.6088880338591766, + "flos": 535799536128.0, + "grad_norm": 0.06350426788679164, + "language_loss": 0.86488736, + "learning_rate": 0.0003502992609085913, + "loss": 0.87577182, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.16223145, + "step": 3165, + "time_per_iteration": 2.6909096240997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087782, + "balance_loss_mlp": 1.07146227, + "epoch": 0.6090804155444401, + "flos": 731533026816.0, + "grad_norm": 0.0979130476844587, + "language_loss": 0.82205462, + "learning_rate": 0.00035000203898872954, + "loss": 0.83293247, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.16320801, + "step": 3166, + "time_per_iteration": 3.0287840366363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092824, + "balance_loss_mlp": 1.07664716, + "epoch": 0.6092727972297037, + "flos": 699014665728.0, + "grad_norm": 0.10375532619284132, + "language_loss": 0.84533244, + "learning_rate": 
0.0003497048753032406, + "loss": 0.85626066, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.16174316, + "step": 3167, + "time_per_iteration": 2.8883583545684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092408, + "balance_loss_mlp": 1.07648182, + "epoch": 0.6094651789149673, + "flos": 1051946735616.0, + "grad_norm": 0.06471277204040406, + "language_loss": 0.80592054, + "learning_rate": 0.000349407769967494, + "loss": 0.81684464, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.15917969, + "step": 3168, + "time_per_iteration": 3.386155605316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099872, + "balance_loss_mlp": 1.08381498, + "epoch": 0.6096575606002309, + "flos": 503085883392.0, + "grad_norm": 0.11400005862882004, + "language_loss": 0.84987879, + "learning_rate": 0.0003491107230968361, + "loss": 0.86087751, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.16052246, + "step": 3169, + "time_per_iteration": 2.6899755001068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096804, + "balance_loss_mlp": 1.08061576, + "epoch": 0.6098499422854944, + "flos": 585643281408.0, + "grad_norm": 0.06652355990642472, + "language_loss": 0.81221354, + "learning_rate": 0.00034881373480659085, + "loss": 0.82318163, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.16186523, + "step": 3170, + "time_per_iteration": 2.8547778129577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101156, + "balance_loss_mlp": 1.08508694, + "epoch": 0.610042323970758, + "flos": 469205996544.0, + "grad_norm": 0.08688268797683278, + "language_loss": 0.77884257, + "learning_rate": 0.0003485168052120594, + "loss": 0.78985405, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.16064453, + "step": 3171, + "time_per_iteration": 2.6543068885803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110044, + "balance_loss_mlp": 1.08477592, 
+ "epoch": 0.6102347056560216, + "flos": 514177403904.0, + "grad_norm": 0.09027989422234346, + "language_loss": 0.79380625, + "learning_rate": 0.00034821993442851973, + "loss": 0.80481064, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.15649414, + "step": 3172, + "time_per_iteration": 2.6117188930511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100941, + "balance_loss_mlp": 1.08522928, + "epoch": 0.6104270873412851, + "flos": 469013276160.0, + "grad_norm": 0.1005367012587997, + "language_loss": 0.82141685, + "learning_rate": 0.00034792312257122735, + "loss": 0.83242625, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.15698242, + "step": 3173, + "time_per_iteration": 2.634824752807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107115, + "balance_loss_mlp": 1.09201097, + "epoch": 0.6106194690265486, + "flos": 549875837952.0, + "grad_norm": 0.07806982240241292, + "language_loss": 0.80516702, + "learning_rate": 0.00034762636975541506, + "loss": 0.81623822, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.15087891, + "step": 3174, + "time_per_iteration": 2.7511277198791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111247, + "balance_loss_mlp": 1.09719944, + "epoch": 0.6108118507118122, + "flos": 472857772032.0, + "grad_norm": 0.09012937190678837, + "language_loss": 0.80371904, + "learning_rate": 0.0003473296760962923, + "loss": 0.81484377, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.15246582, + "step": 3175, + "time_per_iteration": 2.7333414554595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105343, + "balance_loss_mlp": 1.04603887, + "epoch": 0.6110042323970758, + "flos": 1445166904320.0, + "grad_norm": 0.017873347223140334, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79587168, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.07373047, + "step": 3176, + 
"time_per_iteration": 4.656734943389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112456, + "balance_loss_mlp": 1.10965848, + "epoch": 0.6111966140823394, + "flos": 794153590272.0, + "grad_norm": 0.07170779608360676, + "language_loss": 0.81361848, + "learning_rate": 0.00034673646670883976, + "loss": 0.82486403, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.14892578, + "step": 3177, + "time_per_iteration": 2.9838032722473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053161, + "balance_loss_mlp": 1.04572225, + "epoch": 0.611388995767603, + "flos": 1557650663424.0, + "grad_norm": 0.018001303469989904, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76768184, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.07421875, + "step": 3178, + "time_per_iteration": 4.987392425537109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130172, + "balance_loss_mlp": 1.11506796, + "epoch": 0.6115813774528664, + "flos": 712169210880.0, + "grad_norm": 0.0710561364168879, + "language_loss": 0.82215559, + "learning_rate": 0.0003461434953300865, + "loss": 0.83345723, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.15075684, + "step": 3179, + "time_per_iteration": 2.972102165222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129428, + "balance_loss_mlp": 1.11437213, + "epoch": 0.61177375913813, + "flos": 684308072448.0, + "grad_norm": 0.06625806695927375, + "language_loss": 0.81118929, + "learning_rate": 0.0003458470991817515, + "loss": 0.82248354, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.15039062, + "step": 3180, + "time_per_iteration": 2.9920318126678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138939, + "balance_loss_mlp": 1.12371635, + "epoch": 0.6119661408233936, + "flos": 511662127104.0, + "grad_norm": 0.09554430463950304, + "language_loss": 0.84819943, + 
"learning_rate": 0.0003455507628808802, + "loss": 0.8595888, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.15197754, + "step": 3181, + "time_per_iteration": 2.620678424835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138152, + "balance_loss_mlp": 1.12281001, + "epoch": 0.6121585225086572, + "flos": 556809002496.0, + "grad_norm": 0.07764809477009631, + "language_loss": 0.84588206, + "learning_rate": 0.00034525448654252076, + "loss": 0.85726357, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.15319824, + "step": 3182, + "time_per_iteration": 2.662243366241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132309, + "balance_loss_mlp": 1.11651397, + "epoch": 0.6123509041939207, + "flos": 561849467904.0, + "grad_norm": 0.08919622612772353, + "language_loss": 0.8301183, + "learning_rate": 0.0003449582702816976, + "loss": 0.84144139, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.15783691, + "step": 3183, + "time_per_iteration": 2.696509599685669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131366, + "balance_loss_mlp": 1.11577392, + "epoch": 0.6125432858791843, + "flos": 558056729088.0, + "grad_norm": 0.07246136408920362, + "language_loss": 0.82839715, + "learning_rate": 0.0003446621142134122, + "loss": 0.83971083, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.15576172, + "step": 3184, + "time_per_iteration": 2.6876282691955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129534, + "balance_loss_mlp": 1.1142869, + "epoch": 0.6127356675644479, + "flos": 415015944192.0, + "grad_norm": 0.10207734274681185, + "language_loss": 0.84166813, + "learning_rate": 0.0003443660184526424, + "loss": 0.85296345, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.15222168, + "step": 3185, + "time_per_iteration": 2.457191228866577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126874, + 
"balance_loss_mlp": 1.11150801, + "epoch": 0.6129280492497114, + "flos": 603843434496.0, + "grad_norm": 0.08690649590486366, + "language_loss": 0.86419243, + "learning_rate": 0.0003440699831143429, + "loss": 0.8754611, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.15356445, + "step": 3186, + "time_per_iteration": 2.7862656116485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117106, + "balance_loss_mlp": 1.10134614, + "epoch": 0.613120430934975, + "flos": 519766295040.0, + "grad_norm": 0.09433598630753232, + "language_loss": 0.82150078, + "learning_rate": 0.0003437740083134449, + "loss": 0.83267176, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.1574707, + "step": 3187, + "time_per_iteration": 2.732182502746582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110686, + "balance_loss_mlp": 1.09489119, + "epoch": 0.6133128126202385, + "flos": 511083965952.0, + "grad_norm": 0.107565485764287, + "language_loss": 0.83600903, + "learning_rate": 0.00034347809416485574, + "loss": 0.84711587, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.15783691, + "step": 3188, + "time_per_iteration": 2.5941028594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108913, + "balance_loss_mlp": 1.09327221, + "epoch": 0.6135051943055021, + "flos": 607562021376.0, + "grad_norm": 0.07306418964956934, + "language_loss": 0.81643283, + "learning_rate": 0.0003431822407834597, + "loss": 0.82752192, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.15625, + "step": 3189, + "time_per_iteration": 2.79345440864563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107509, + "balance_loss_mlp": 1.09151149, + "epoch": 0.6136975759907657, + "flos": 1160200931328.0, + "grad_norm": 0.07663580973151435, + "language_loss": 0.83989727, + "learning_rate": 0.00034288644828411706, + "loss": 0.85097235, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 
0.15991211, + "step": 3190, + "time_per_iteration": 3.495431423187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107683, + "balance_loss_mlp": 1.0914706, + "epoch": 0.6138899576760293, + "flos": 706938596352.0, + "grad_norm": 0.09805760174561111, + "language_loss": 0.75479543, + "learning_rate": 0.0003425907167816649, + "loss": 0.76587236, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.16210938, + "step": 3191, + "time_per_iteration": 2.890688896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100168, + "balance_loss_mlp": 1.0839076, + "epoch": 0.6140823393612928, + "flos": 586443898368.0, + "grad_norm": 0.08119558149243, + "language_loss": 0.84596795, + "learning_rate": 0.00034229504639091623, + "loss": 0.85696959, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.16259766, + "step": 3192, + "time_per_iteration": 2.799213171005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110546, + "balance_loss_mlp": 1.08940232, + "epoch": 0.6142747210465563, + "flos": 804130633728.0, + "grad_norm": 0.13197057459029027, + "language_loss": 0.79937923, + "learning_rate": 0.0003419994372266606, + "loss": 0.81043386, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.16052246, + "step": 3193, + "time_per_iteration": 3.1180262565612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103547, + "balance_loss_mlp": 1.08715582, + "epoch": 0.6144671027318199, + "flos": 529434620928.0, + "grad_norm": 0.07478792325095046, + "language_loss": 0.81555808, + "learning_rate": 0.00034170388940366335, + "loss": 0.82659352, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.16381836, + "step": 3194, + "time_per_iteration": 2.7108078002929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105556, + "balance_loss_mlp": 1.08935523, + "epoch": 0.6146594844170835, + "flos": 805425348096.0, + "grad_norm": 0.1666581336707107, + "language_loss": 
0.80146444, + "learning_rate": 0.0003414084030366667, + "loss": 0.81251997, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.16210938, + "step": 3195, + "time_per_iteration": 3.146375894546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098435, + "balance_loss_mlp": 1.08159113, + "epoch": 0.6148518661023471, + "flos": 501697193472.0, + "grad_norm": 0.07855669714866301, + "language_loss": 0.82993454, + "learning_rate": 0.0003411129782403883, + "loss": 0.8409189, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.1685791, + "step": 3196, + "time_per_iteration": 2.6907546520233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102656, + "balance_loss_mlp": 1.0864203, + "epoch": 0.6150442477876106, + "flos": 510688613376.0, + "grad_norm": 0.08662161159961286, + "language_loss": 0.84978783, + "learning_rate": 0.0003408176151295225, + "loss": 0.86081439, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.16235352, + "step": 3197, + "time_per_iteration": 2.7353785037994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098614, + "balance_loss_mlp": 1.08207965, + "epoch": 0.6152366294728742, + "flos": 527005979136.0, + "grad_norm": 0.11963983083590954, + "language_loss": 0.77372497, + "learning_rate": 0.00034052231381873944, + "loss": 0.78471112, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.16540527, + "step": 3198, + "time_per_iteration": 2.673388957977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097817, + "balance_loss_mlp": 1.08129418, + "epoch": 0.6154290111581378, + "flos": 473300112384.0, + "grad_norm": 0.07877091537638886, + "language_loss": 0.84876865, + "learning_rate": 0.00034022707442268494, + "loss": 0.85974681, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.1652832, + "step": 3199, + "time_per_iteration": 2.5626182556152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090925, + 
"balance_loss_mlp": 1.07454538, + "epoch": 0.6156213928434013, + "flos": 550819616256.0, + "grad_norm": 0.07568498479176501, + "language_loss": 0.81815386, + "learning_rate": 0.0003399318970559813, + "loss": 0.82906306, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.16381836, + "step": 3200, + "time_per_iteration": 2.829237461090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108966, + "balance_loss_mlp": 1.07353139, + "epoch": 0.6158137745286649, + "flos": 750941259264.0, + "grad_norm": 0.2497942099132976, + "language_loss": 0.8433665, + "learning_rate": 0.00033963678183322656, + "loss": 0.85426307, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.16125488, + "step": 3201, + "time_per_iteration": 3.1063387393951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087739, + "balance_loss_mlp": 1.07162154, + "epoch": 0.6160061562139284, + "flos": 555815665152.0, + "grad_norm": 0.06940460952874025, + "language_loss": 0.82539898, + "learning_rate": 0.0003393417288689945, + "loss": 0.83627635, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.16113281, + "step": 3202, + "time_per_iteration": 2.7065072059631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093588, + "balance_loss_mlp": 1.0775063, + "epoch": 0.616198537899192, + "flos": 742177437696.0, + "grad_norm": 0.08060008317875632, + "language_loss": 0.75810564, + "learning_rate": 0.00033904673827783504, + "loss": 0.76904154, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.1607666, + "step": 3203, + "time_per_iteration": 2.976076364517212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091051, + "balance_loss_mlp": 1.07505345, + "epoch": 0.6163909195844556, + "flos": 478810082304.0, + "grad_norm": 0.05609765928721304, + "language_loss": 0.81290334, + "learning_rate": 0.00033875181017427357, + "loss": 0.8238138, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 
0.15991211, + "step": 3204, + "time_per_iteration": 2.617102861404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094322, + "balance_loss_mlp": 1.0783596, + "epoch": 0.6165833012697192, + "flos": 531517469184.0, + "grad_norm": 0.06962026765049416, + "language_loss": 0.80802751, + "learning_rate": 0.00033845694467281133, + "loss": 0.81897068, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.1595459, + "step": 3205, + "time_per_iteration": 2.9406063556671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100233, + "balance_loss_mlp": 1.08492684, + "epoch": 0.6167756829549826, + "flos": 807765156864.0, + "grad_norm": 0.08157941962089017, + "language_loss": 0.83428419, + "learning_rate": 0.00033816214188792516, + "loss": 0.84528655, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.1529541, + "step": 3206, + "time_per_iteration": 3.1819798946380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097691, + "balance_loss_mlp": 1.08228946, + "epoch": 0.6169680646402462, + "flos": 488928089088.0, + "grad_norm": 0.0725317157216798, + "language_loss": 0.85080433, + "learning_rate": 0.00033786740193406784, + "loss": 0.86178124, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.15380859, + "step": 3207, + "time_per_iteration": 2.5949695110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099512, + "balance_loss_mlp": 1.08397925, + "epoch": 0.6171604463255098, + "flos": 618954918912.0, + "grad_norm": 0.09100196338205928, + "language_loss": 0.81269908, + "learning_rate": 0.00033757272492566736, + "loss": 0.82369423, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.15515137, + "step": 3208, + "time_per_iteration": 2.896113157272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101736, + "balance_loss_mlp": 1.08576202, + "epoch": 0.6173528280107734, + "flos": 528859031040.0, + "grad_norm": 0.061084762656912546, + 
"language_loss": 0.86857277, + "learning_rate": 0.0003372781109771278, + "loss": 0.87959015, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.15966797, + "step": 3209, + "time_per_iteration": 2.7744648456573486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098286, + "balance_loss_mlp": 1.08235943, + "epoch": 0.617545209696037, + "flos": 596581728768.0, + "grad_norm": 0.0666635733945454, + "language_loss": 0.7634722, + "learning_rate": 0.0003369835602028281, + "loss": 0.77445507, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.15917969, + "step": 3210, + "time_per_iteration": 2.807690143585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109414, + "balance_loss_mlp": 1.07845259, + "epoch": 0.6177375913813005, + "flos": 475098835968.0, + "grad_norm": 0.06505304980204422, + "language_loss": 0.79307866, + "learning_rate": 0.0003366890727171232, + "loss": 0.80402005, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.15673828, + "step": 3211, + "time_per_iteration": 2.6847074031829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092887, + "balance_loss_mlp": 1.07701993, + "epoch": 0.617929973066564, + "flos": 529812721152.0, + "grad_norm": 0.08815950120803863, + "language_loss": 0.78273273, + "learning_rate": 0.00033639464863434313, + "loss": 0.79366159, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.15856934, + "step": 3212, + "time_per_iteration": 2.6401009559631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_mlp": 1.0277102, + "epoch": 0.6181223547518276, + "flos": 1420053783552.0, + "grad_norm": 0.026269033760010364, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79478669, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.08496094, + "step": 3213, + "time_per_iteration": 4.715362787246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.0108628, + "balance_loss_mlp": 1.07037783, + "epoch": 0.6183147364370912, + "flos": 740319243264.0, + "grad_norm": 0.0738307593479646, + "language_loss": 0.79866982, + "learning_rate": 0.00033580599113475543, + "loss": 0.80953264, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.15893555, + "step": 3214, + "time_per_iteration": 3.000586986541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085789, + "balance_loss_mlp": 1.06937385, + "epoch": 0.6185071181223547, + "flos": 381649978368.0, + "grad_norm": 0.07082068470291375, + "language_loss": 0.86112303, + "learning_rate": 0.00033551175794648507, + "loss": 0.87198091, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.16418457, + "step": 3215, + "time_per_iteration": 2.494271755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090218, + "balance_loss_mlp": 1.0744828, + "epoch": 0.6186994998076183, + "flos": 463347661824.0, + "grad_norm": 0.12386747006326235, + "language_loss": 0.81595516, + "learning_rate": 0.00033521758861821365, + "loss": 0.82685733, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.15722656, + "step": 3216, + "time_per_iteration": 2.646888256072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084021, + "balance_loss_mlp": 1.06845176, + "epoch": 0.6188918814928819, + "flos": 485273742336.0, + "grad_norm": 0.07895450419788622, + "language_loss": 0.88963878, + "learning_rate": 0.0003349234832641479, + "loss": 0.90047896, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.15551758, + "step": 3217, + "time_per_iteration": 2.603308916091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082545, + "balance_loss_mlp": 1.06719124, + "epoch": 0.6190842631781455, + "flos": 657307021824.0, + "grad_norm": 0.07412246330535043, + "language_loss": 0.808752, + "learning_rate": 0.00033462944199846975, + "loss": 0.81957746, + "num_input_tokens_seen": 268478512, + 
"router_z_loss_mlp": 0.15332031, + "step": 3218, + "time_per_iteration": 3.086716413497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083407, + "balance_loss_mlp": 1.06795752, + "epoch": 0.619276644863409, + "flos": 403603223040.0, + "grad_norm": 0.07145505501141985, + "language_loss": 0.86298114, + "learning_rate": 0.00033433546493533606, + "loss": 0.87381524, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.15429688, + "step": 3219, + "time_per_iteration": 2.525264024734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079767, + "balance_loss_mlp": 1.06349516, + "epoch": 0.6194690265486725, + "flos": 583093499904.0, + "grad_norm": 0.086291171152169, + "language_loss": 0.83994114, + "learning_rate": 0.00033404155218887897, + "loss": 0.85073888, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.16271973, + "step": 3220, + "time_per_iteration": 2.7530763149261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080633, + "balance_loss_mlp": 1.06478977, + "epoch": 0.6196614082339361, + "flos": 504246974976.0, + "grad_norm": 0.11530682173053017, + "language_loss": 0.87328637, + "learning_rate": 0.00033374770387320534, + "loss": 0.88409269, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.1583252, + "step": 3221, + "time_per_iteration": 2.769804000854492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107728, + "balance_loss_mlp": 1.06110358, + "epoch": 0.6198537899191997, + "flos": 575409277440.0, + "grad_norm": 0.09653805931546991, + "language_loss": 0.84981918, + "learning_rate": 0.00033345392010239737, + "loss": 0.86059201, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.16174316, + "step": 3222, + "time_per_iteration": 2.742431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080173, + "balance_loss_mlp": 1.0643059, + "epoch": 0.6200461716044633, + "flos": 593157178368.0, + "grad_norm": 
0.08405780593634497, + "language_loss": 0.82221037, + "learning_rate": 0.0003331602009905118, + "loss": 0.8330121, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.15856934, + "step": 3223, + "time_per_iteration": 2.8276350498199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075434, + "balance_loss_mlp": 1.05924559, + "epoch": 0.6202385532897268, + "flos": 666093238272.0, + "grad_norm": 0.16424334065153295, + "language_loss": 0.83946419, + "learning_rate": 0.00033286654665158085, + "loss": 0.85021853, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.16186523, + "step": 3224, + "time_per_iteration": 3.0171141624450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074402, + "balance_loss_mlp": 1.05797529, + "epoch": 0.6204309349749904, + "flos": 484952541696.0, + "grad_norm": 0.07119512834175158, + "language_loss": 0.8751117, + "learning_rate": 0.0003325729571996109, + "loss": 0.88585573, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.16430664, + "step": 3225, + "time_per_iteration": 2.6336770057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107489, + "balance_loss_mlp": 1.05822468, + "epoch": 0.6206233166602539, + "flos": 584057101824.0, + "grad_norm": 0.07015160541541936, + "language_loss": 0.83497381, + "learning_rate": 0.000332279432748584, + "loss": 0.84572268, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.16674805, + "step": 3226, + "time_per_iteration": 2.8068268299102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077491, + "balance_loss_mlp": 1.06129014, + "epoch": 0.6208156983455175, + "flos": 476917383168.0, + "grad_norm": 0.08244551299177609, + "language_loss": 0.87847024, + "learning_rate": 0.00033198597341245576, + "loss": 0.88924515, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.1619873, + "step": 3227, + "time_per_iteration": 2.6014742851257324 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01070414, + "balance_loss_mlp": 1.05366468, + "epoch": 0.6210080800307811, + "flos": 789066137088.0, + "grad_norm": 0.25336628226947533, + "language_loss": 0.82029134, + "learning_rate": 0.00033169257930515763, + "loss": 0.83099544, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.16760254, + "step": 3228, + "time_per_iteration": 3.086378335952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080291, + "balance_loss_mlp": 1.06385183, + "epoch": 0.6212004617160446, + "flos": 607794388992.0, + "grad_norm": 0.06847993393240591, + "language_loss": 0.81926602, + "learning_rate": 0.0003313992505405951, + "loss": 0.83006895, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.16442871, + "step": 3229, + "time_per_iteration": 2.721404552459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108161, + "balance_loss_mlp": 1.06523085, + "epoch": 0.6213928434013082, + "flos": 586520621568.0, + "grad_norm": 0.08774924487902723, + "language_loss": 0.81243527, + "learning_rate": 0.0003311059872326487, + "loss": 0.82325131, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.16381836, + "step": 3230, + "time_per_iteration": 2.698370933532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108911, + "balance_loss_mlp": 1.07283747, + "epoch": 0.6215852250865718, + "flos": 536076320256.0, + "grad_norm": 0.06270851897860089, + "language_loss": 0.79239869, + "learning_rate": 0.0003308127894951734, + "loss": 0.80328983, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.16271973, + "step": 3231, + "time_per_iteration": 2.642587900161743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103422, + "balance_loss_mlp": 1.08775783, + "epoch": 0.6217776067718354, + "flos": 618169356288.0, + "grad_norm": 0.08661735945453952, + "language_loss": 0.86286879, + "learning_rate": 0.00033051965744199834, + "loss": 0.87390304, + "num_input_tokens_seen": 
269498784, + "router_z_loss_mlp": 0.15649414, + "step": 3232, + "time_per_iteration": 2.7654480934143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110456, + "balance_loss_mlp": 1.08876467, + "epoch": 0.6219699884570988, + "flos": 545875324416.0, + "grad_norm": 0.08070984322149112, + "language_loss": 0.90182227, + "learning_rate": 0.0003302265911869276, + "loss": 0.91286784, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.15795898, + "step": 3233, + "time_per_iteration": 2.973137378692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102412, + "balance_loss_mlp": 1.0863899, + "epoch": 0.6221623701423624, + "flos": 481149891072.0, + "grad_norm": 0.10903375315804033, + "language_loss": 0.83981085, + "learning_rate": 0.0003299335908437397, + "loss": 0.85083497, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.16015625, + "step": 3234, + "time_per_iteration": 2.6683669090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110257, + "balance_loss_mlp": 1.08685815, + "epoch": 0.622354751827626, + "flos": 380024151552.0, + "grad_norm": 0.08931018897921299, + "language_loss": 0.79380894, + "learning_rate": 0.0003296406565261873, + "loss": 0.8048346, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.15698242, + "step": 3235, + "time_per_iteration": 2.4825046062469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093994, + "balance_loss_mlp": 1.07830596, + "epoch": 0.6225471335128896, + "flos": 667869940224.0, + "grad_norm": 0.08356203677031868, + "language_loss": 0.84839869, + "learning_rate": 0.0003293477883479978, + "loss": 0.85933864, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.15673828, + "step": 3236, + "time_per_iteration": 2.855417013168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096114, + "balance_loss_mlp": 1.08046174, + "epoch": 0.6227395151981532, + "flos": 771320807424.0, + "grad_norm": 
0.0752906084942527, + "language_loss": 0.79873055, + "learning_rate": 0.0003290549864228727, + "loss": 0.80969167, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.15637207, + "step": 3237, + "time_per_iteration": 2.954319953918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094914, + "balance_loss_mlp": 1.07898724, + "epoch": 0.6229318968834167, + "flos": 484354556928.0, + "grad_norm": 0.0798274919474459, + "language_loss": 0.86145848, + "learning_rate": 0.0003287622508644875, + "loss": 0.87240762, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.15917969, + "step": 3238, + "time_per_iteration": 2.7834508419036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095092, + "balance_loss_mlp": 1.07920146, + "epoch": 0.6231242785686802, + "flos": 462935056896.0, + "grad_norm": 0.08228635643627878, + "language_loss": 0.86427939, + "learning_rate": 0.0003284695817864923, + "loss": 0.87523031, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.15881348, + "step": 3239, + "time_per_iteration": 2.52299427986145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089491, + "balance_loss_mlp": 1.07388628, + "epoch": 0.6233166602539438, + "flos": 609089103360.0, + "grad_norm": 0.07912840789320032, + "language_loss": 0.83886796, + "learning_rate": 0.0003281769793025116, + "loss": 0.84976286, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.15588379, + "step": 3240, + "time_per_iteration": 2.736513614654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090298, + "balance_loss_mlp": 1.07525432, + "epoch": 0.6235090419392074, + "flos": 439200340992.0, + "grad_norm": 0.08036892690919402, + "language_loss": 0.89556086, + "learning_rate": 0.00032788444352614346, + "loss": 0.90646392, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.15014648, + "step": 3241, + "time_per_iteration": 2.532486915588379 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01091662, + "balance_loss_mlp": 1.07645059, + "epoch": 0.6237014236244709, + "flos": 504904430592.0, + "grad_norm": 0.10748346186941515, + "language_loss": 0.80754519, + "learning_rate": 0.0003275919745709606, + "loss": 0.81846178, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.15197754, + "step": 3242, + "time_per_iteration": 2.6164467334747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093762, + "balance_loss_mlp": 1.07853913, + "epoch": 0.6238938053097345, + "flos": 512917194240.0, + "grad_norm": 0.07410139780614007, + "language_loss": 0.82327247, + "learning_rate": 0.00032729957255050936, + "loss": 0.83421004, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.15197754, + "step": 3243, + "time_per_iteration": 2.711912155151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094763, + "balance_loss_mlp": 1.07895613, + "epoch": 0.6240861869949981, + "flos": 736751531520.0, + "grad_norm": 0.07913543428232035, + "language_loss": 0.81355995, + "learning_rate": 0.0003270072375783102, + "loss": 0.82450759, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.15795898, + "step": 3244, + "time_per_iteration": 2.896878242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091166, + "balance_loss_mlp": 1.07548952, + "epoch": 0.6242785686802617, + "flos": 494712271872.0, + "grad_norm": 0.10691714389102631, + "language_loss": 0.79955053, + "learning_rate": 0.00032671496976785774, + "loss": 0.81046224, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.15661621, + "step": 3245, + "time_per_iteration": 2.6352155208587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089637, + "balance_loss_mlp": 1.07429433, + "epoch": 0.6244709503655252, + "flos": 745846465536.0, + "grad_norm": 0.06870861562769151, + "language_loss": 0.75493729, + "learning_rate": 0.0003264227692326205, + "loss": 0.76583362, + "num_input_tokens_seen": 
270501680, + "router_z_loss_mlp": 0.15319824, + "step": 3246, + "time_per_iteration": 3.093111991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098568, + "balance_loss_mlp": 1.08292735, + "epoch": 0.6246633320507887, + "flos": 492602259456.0, + "grad_norm": 0.06424326039406808, + "language_loss": 0.85849744, + "learning_rate": 0.00032613063608604055, + "loss": 0.86948311, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.15625, + "step": 3247, + "time_per_iteration": 2.5499489307403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109153, + "balance_loss_mlp": 1.07599711, + "epoch": 0.6248557137360523, + "flos": 517391981568.0, + "grad_norm": 0.07629898718313471, + "language_loss": 0.83584791, + "learning_rate": 0.0003258385704415343, + "loss": 0.84676319, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.15515137, + "step": 3248, + "time_per_iteration": 2.6027162075042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098413, + "balance_loss_mlp": 1.08287978, + "epoch": 0.6250480954213159, + "flos": 519363601920.0, + "grad_norm": 0.08365862742240879, + "language_loss": 0.83149463, + "learning_rate": 0.0003255465724124915, + "loss": 0.84247875, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.15515137, + "step": 3249, + "time_per_iteration": 2.730041742324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104065, + "balance_loss_mlp": 1.08905637, + "epoch": 0.6252404771065795, + "flos": 516060191232.0, + "grad_norm": 0.06996210477337128, + "language_loss": 0.82732821, + "learning_rate": 0.00032525464211227587, + "loss": 0.83836889, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.14990234, + "step": 3250, + "time_per_iteration": 2.610226631164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103584, + "balance_loss_mlp": 1.08822954, + "epoch": 0.6254328587918431, + "flos": 576916535808.0, + "grad_norm": 
0.07802302552021714, + "language_loss": 0.85721552, + "learning_rate": 0.0003249627796542249, + "loss": 0.86825138, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.15344238, + "step": 3251, + "time_per_iteration": 2.6803338527679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096481, + "balance_loss_mlp": 1.08087671, + "epoch": 0.6256252404771065, + "flos": 597930771456.0, + "grad_norm": 0.06796886597931054, + "language_loss": 0.84280014, + "learning_rate": 0.00032467098515164943, + "loss": 0.85376501, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.15588379, + "step": 3252, + "time_per_iteration": 2.904672861099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111129, + "balance_loss_mlp": 1.0956316, + "epoch": 0.6258176221623701, + "flos": 508299245568.0, + "grad_norm": 0.09344441617703737, + "language_loss": 0.84051675, + "learning_rate": 0.00032437925871783456, + "loss": 0.85162807, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.15490723, + "step": 3253, + "time_per_iteration": 2.704474925994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101193, + "balance_loss_mlp": 1.08483791, + "epoch": 0.6260100038476337, + "flos": 639645755904.0, + "grad_norm": 0.07749015001842677, + "language_loss": 0.84249985, + "learning_rate": 0.00032408760046603803, + "loss": 0.85351181, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.16357422, + "step": 3254, + "time_per_iteration": 2.849126100540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103151, + "balance_loss_mlp": 1.08711767, + "epoch": 0.6262023855328973, + "flos": 841007784960.0, + "grad_norm": 0.06356173673048542, + "language_loss": 0.77591729, + "learning_rate": 0.00032379601050949193, + "loss": 0.7869488, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.16027832, + "step": 3255, + "time_per_iteration": 3.119446039199829 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01091759, + "balance_loss_mlp": 1.07567763, + "epoch": 0.6263947672181608, + "flos": 522138410496.0, + "grad_norm": 0.07099798936628814, + "language_loss": 0.88052809, + "learning_rate": 0.0003235044889614013, + "loss": 0.8914457, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.1607666, + "step": 3256, + "time_per_iteration": 2.613060235977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094317, + "balance_loss_mlp": 1.07879567, + "epoch": 0.6265871489034244, + "flos": 607055440896.0, + "grad_norm": 0.09103285060776488, + "language_loss": 0.8368516, + "learning_rate": 0.0003232130359349451, + "loss": 0.84779477, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.1550293, + "step": 3257, + "time_per_iteration": 2.8671774864196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089032, + "balance_loss_mlp": 1.07287872, + "epoch": 0.626779530588688, + "flos": 588484901376.0, + "grad_norm": 0.0836607688375681, + "language_loss": 0.81645948, + "learning_rate": 0.0003229216515432751, + "loss": 0.82734984, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.16149902, + "step": 3258, + "time_per_iteration": 2.8217055797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093741, + "balance_loss_mlp": 1.0781126, + "epoch": 0.6269719122739515, + "flos": 438612268032.0, + "grad_norm": 0.07437080519931394, + "language_loss": 0.79591352, + "learning_rate": 0.0003226303358995174, + "loss": 0.80685091, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.15612793, + "step": 3259, + "time_per_iteration": 2.613922595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109242, + "balance_loss_mlp": 1.07588553, + "epoch": 0.6271642939592151, + "flos": 562874738688.0, + "grad_norm": 0.06263163093589014, + "language_loss": 0.88819879, + "learning_rate": 0.00032233908911677, + "loss": 0.89912301, + "num_input_tokens_seen": 
271526768, + "router_z_loss_mlp": 0.16540527, + "step": 3260, + "time_per_iteration": 2.855600118637085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109092, + "balance_loss_mlp": 1.07450485, + "epoch": 0.6273566756444786, + "flos": 514560273408.0, + "grad_norm": 0.06460016363514721, + "language_loss": 0.80802065, + "learning_rate": 0.0003220479113081053, + "loss": 0.81892991, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.16418457, + "step": 3261, + "time_per_iteration": 2.753509759902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085244, + "balance_loss_mlp": 1.06910312, + "epoch": 0.6275490573297422, + "flos": 585472955904.0, + "grad_norm": 0.074937478592973, + "language_loss": 0.79032731, + "learning_rate": 0.00032175680258656836, + "loss": 0.80117977, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.16137695, + "step": 3262, + "time_per_iteration": 2.7336065769195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085552, + "balance_loss_mlp": 1.06954229, + "epoch": 0.6277414390150058, + "flos": 559423024128.0, + "grad_norm": 0.06015193391132931, + "language_loss": 0.79762304, + "learning_rate": 0.00032146576306517794, + "loss": 0.80847853, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.16003418, + "step": 3263, + "time_per_iteration": 2.8162710666656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087014, + "balance_loss_mlp": 1.070611, + "epoch": 0.6279338207002694, + "flos": 612706374144.0, + "grad_norm": 0.08732390262483163, + "language_loss": 0.80907923, + "learning_rate": 0.0003211747928569255, + "loss": 0.81994939, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.1640625, + "step": 3264, + "time_per_iteration": 2.7805709838867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087684, + "balance_loss_mlp": 1.07150757, + "epoch": 0.6281262023855329, + "flos": 625685451264.0, + "grad_norm": 
0.06366142393715324, + "language_loss": 0.81574047, + "learning_rate": 0.0003208838920747754, + "loss": 0.82661736, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.16174316, + "step": 3265, + "time_per_iteration": 2.8634932041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087654, + "balance_loss_mlp": 1.07176387, + "epoch": 0.6283185840707964, + "flos": 1123600564224.0, + "grad_norm": 0.06892871755232625, + "language_loss": 0.76471019, + "learning_rate": 0.0003205930608316656, + "loss": 0.77558672, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.15881348, + "step": 3266, + "time_per_iteration": 3.491633176803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088752, + "balance_loss_mlp": 1.07274199, + "epoch": 0.62851096575606, + "flos": 515239750656.0, + "grad_norm": 0.07065676872193134, + "language_loss": 0.84763551, + "learning_rate": 0.00032030229924050673, + "loss": 0.85852307, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.16003418, + "step": 3267, + "time_per_iteration": 2.7322630882263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081627, + "balance_loss_mlp": 1.0655694, + "epoch": 0.6287033474413236, + "flos": 404171472384.0, + "grad_norm": 0.076810738762244, + "language_loss": 0.80159783, + "learning_rate": 0.00032001160741418247, + "loss": 0.81241405, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.16052246, + "step": 3268, + "time_per_iteration": 2.683931589126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083715, + "balance_loss_mlp": 1.06859946, + "epoch": 0.6288957291265872, + "flos": 525718605312.0, + "grad_norm": 0.07050633409019491, + "language_loss": 0.81839114, + "learning_rate": 0.0003197209854655494, + "loss": 0.82922828, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.15100098, + "step": 3269, + "time_per_iteration": 2.7007665634155273 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01088437, + "balance_loss_mlp": 1.0728085, + "epoch": 0.6290881108118507, + "flos": 603722294784.0, + "grad_norm": 0.07859150018843152, + "language_loss": 0.74576277, + "learning_rate": 0.0003194304335074371, + "loss": 0.75664711, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.15625, + "step": 3270, + "time_per_iteration": 2.8443710803985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093344, + "balance_loss_mlp": 1.07737029, + "epoch": 0.6292804924971143, + "flos": 437675830272.0, + "grad_norm": 0.07641817393063347, + "language_loss": 0.88118923, + "learning_rate": 0.0003191399516526475, + "loss": 0.89212275, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.15966797, + "step": 3271, + "time_per_iteration": 2.510565996170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109192, + "balance_loss_mlp": 1.07666111, + "epoch": 0.6294728741823779, + "flos": 606662659584.0, + "grad_norm": 0.06496379597485666, + "language_loss": 0.79376519, + "learning_rate": 0.0003188495400139559, + "loss": 0.8046844, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.15234375, + "step": 3272, + "time_per_iteration": 2.8364667892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095867, + "balance_loss_mlp": 1.0803932, + "epoch": 0.6296652558676414, + "flos": 701529942528.0, + "grad_norm": 0.07122529047297946, + "language_loss": 0.8439455, + "learning_rate": 0.00031855919870411013, + "loss": 0.85490417, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.15466309, + "step": 3273, + "time_per_iteration": 2.8570995330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086291, + "balance_loss_mlp": 1.0712353, + "epoch": 0.6298576375529049, + "flos": 523909969920.0, + "grad_norm": 0.06914500829494513, + "language_loss": 0.84985608, + "learning_rate": 0.0003182689278358305, + "loss": 0.86071897, + "num_input_tokens_seen": 
272562992, + "router_z_loss_mlp": 0.15039062, + "step": 3274, + "time_per_iteration": 2.757631301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108922, + "balance_loss_mlp": 1.07361603, + "epoch": 0.6300500192381685, + "flos": 475963693056.0, + "grad_norm": 0.07954775406848916, + "language_loss": 0.79536891, + "learning_rate": 0.0003179787275218105, + "loss": 0.80626118, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.15588379, + "step": 3275, + "time_per_iteration": 2.562164545059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083819, + "balance_loss_mlp": 1.06884634, + "epoch": 0.6302424009234321, + "flos": 520880772096.0, + "grad_norm": 0.08328401336331384, + "language_loss": 0.84322137, + "learning_rate": 0.0003176885978747155, + "loss": 0.85405958, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.14953613, + "step": 3276, + "time_per_iteration": 2.6230828762054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085747, + "balance_loss_mlp": 1.07017803, + "epoch": 0.6304347826086957, + "flos": 694596777984.0, + "grad_norm": 0.1699824723402015, + "language_loss": 0.82447994, + "learning_rate": 0.0003173985390071839, + "loss": 0.8353374, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.15551758, + "step": 3277, + "time_per_iteration": 2.913857936859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011052, + "balance_loss_mlp": 1.00342274, + "epoch": 0.6306271642939593, + "flos": 1466858045952.0, + "grad_norm": 0.01180096248497286, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78911507, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.07617188, + "step": 3278, + "time_per_iteration": 4.810575008392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095394, + "balance_loss_mlp": 1.07975388, + "epoch": 0.6308195459792227, + "flos": 601740762624.0, + "grad_norm": 
0.07584058368204265, + "language_loss": 0.81100649, + "learning_rate": 0.00031681863406122704, + "loss": 0.82196045, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.15625, + "step": 3279, + "time_per_iteration": 2.8176543712615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094938, + "balance_loss_mlp": 1.07984567, + "epoch": 0.6310119276644863, + "flos": 726858178560.0, + "grad_norm": 0.07145164235931235, + "language_loss": 0.85147798, + "learning_rate": 0.00031652878820794087, + "loss": 0.86242729, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.15063477, + "step": 3280, + "time_per_iteration": 3.010453462600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099528, + "balance_loss_mlp": 1.08434081, + "epoch": 0.6312043093497499, + "flos": 519749042688.0, + "grad_norm": 0.08537377503877883, + "language_loss": 0.85849619, + "learning_rate": 0.00031623901358449627, + "loss": 0.86949146, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.15161133, + "step": 3281, + "time_per_iteration": 2.6708781719207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101817, + "balance_loss_mlp": 1.08709431, + "epoch": 0.6313966910350135, + "flos": 531191499264.0, + "grad_norm": 0.05886068654642298, + "language_loss": 0.88589537, + "learning_rate": 0.0003159493103033936, + "loss": 0.89691359, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.14709473, + "step": 3282, + "time_per_iteration": 2.636570930480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023039, + "balance_loss_mlp": 1.01540971, + "epoch": 0.631589072720277, + "flos": 1379887529472.0, + "grad_norm": 0.014741970221396734, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80942094, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.07617188, + "step": 3283, + "time_per_iteration": 4.921837568283081 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01098019, + "balance_loss_mlp": 1.08298671, + "epoch": 0.6317814544055406, + "flos": 624677432832.0, + "grad_norm": 0.06611749936023467, + "language_loss": 0.82335258, + "learning_rate": 0.0003153701182180776, + "loss": 0.83433276, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.15014648, + "step": 3284, + "time_per_iteration": 2.804680824279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100744, + "balance_loss_mlp": 1.08583045, + "epoch": 0.6319738360908042, + "flos": 498119569920.0, + "grad_norm": 0.09468051023791588, + "language_loss": 0.81480467, + "learning_rate": 0.00031508062963872655, + "loss": 0.8258121, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.14892578, + "step": 3285, + "time_per_iteration": 2.618572950363159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104547, + "balance_loss_mlp": 1.08974171, + "epoch": 0.6321662177760677, + "flos": 579760353792.0, + "grad_norm": 0.07285431421686336, + "language_loss": 0.79529119, + "learning_rate": 0.0003147912128514423, + "loss": 0.80633664, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.14794922, + "step": 3286, + "time_per_iteration": 2.7349414825439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112049, + "balance_loss_mlp": 1.0971601, + "epoch": 0.6323585994613313, + "flos": 601486373376.0, + "grad_norm": 0.07001944194285717, + "language_loss": 0.87457585, + "learning_rate": 0.0003145018679685859, + "loss": 0.88569629, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.14868164, + "step": 3287, + "time_per_iteration": 2.735057830810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106918, + "balance_loss_mlp": 1.09238625, + "epoch": 0.6325509811465948, + "flos": 528535259136.0, + "grad_norm": 0.06287056538994153, + "language_loss": 0.87662357, + "learning_rate": 0.00031421259510249134, + "loss": 0.88769281, + 
"num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.1451416, + "step": 3288, + "time_per_iteration": 2.7864692211151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112538, + "balance_loss_mlp": 1.09816122, + "epoch": 0.6327433628318584, + "flos": 574262866944.0, + "grad_norm": 0.07989548298416052, + "language_loss": 0.80931014, + "learning_rate": 0.00031392339436546414, + "loss": 0.82043552, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.14355469, + "step": 3289, + "time_per_iteration": 2.8174936771392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110895, + "balance_loss_mlp": 1.09549332, + "epoch": 0.632935744517122, + "flos": 517088033280.0, + "grad_norm": 0.0967935034115468, + "language_loss": 0.83535063, + "learning_rate": 0.00031363426586978205, + "loss": 0.84645951, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.15380859, + "step": 3290, + "time_per_iteration": 2.7781615257263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106481, + "balance_loss_mlp": 1.09155595, + "epoch": 0.6331281262023856, + "flos": 617462714880.0, + "grad_norm": 0.07036168037167431, + "language_loss": 0.84420347, + "learning_rate": 0.0003133452097276947, + "loss": 0.8552683, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.14904785, + "step": 3291, + "time_per_iteration": 2.7578635215759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098733, + "balance_loss_mlp": 1.08364153, + "epoch": 0.633320507887649, + "flos": 592954546176.0, + "grad_norm": 0.07346038815510673, + "language_loss": 0.84298337, + "learning_rate": 0.0003130562260514238, + "loss": 0.85397065, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.15075684, + "step": 3292, + "time_per_iteration": 2.798175096511841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092883, + "balance_loss_mlp": 1.07720733, + "epoch": 0.6335128895729126, + "flos": 
582349782528.0, + "grad_norm": 0.07455275976827726, + "language_loss": 0.81438339, + "learning_rate": 0.0003127673149531626, + "loss": 0.8253122, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.15661621, + "step": 3293, + "time_per_iteration": 2.7655112743377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086095, + "balance_loss_mlp": 1.07095516, + "epoch": 0.6337052712581762, + "flos": 453036934656.0, + "grad_norm": 0.083592197063536, + "language_loss": 0.83216, + "learning_rate": 0.0003124784765450762, + "loss": 0.84302098, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.15124512, + "step": 3294, + "time_per_iteration": 2.5880134105682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092147, + "balance_loss_mlp": 1.07686436, + "epoch": 0.6338976529434398, + "flos": 573407921664.0, + "grad_norm": 0.09213521836591561, + "language_loss": 0.79931903, + "learning_rate": 0.0003121897109393017, + "loss": 0.81024045, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.15283203, + "step": 3295, + "time_per_iteration": 2.7655093669891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086202, + "balance_loss_mlp": 1.07047844, + "epoch": 0.6340900346287034, + "flos": 508758838272.0, + "grad_norm": 0.06242699112369121, + "language_loss": 0.88973814, + "learning_rate": 0.0003119010182479481, + "loss": 0.90060019, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.15710449, + "step": 3296, + "time_per_iteration": 2.631047010421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086362, + "balance_loss_mlp": 1.07093644, + "epoch": 0.6342824163139669, + "flos": 479746520064.0, + "grad_norm": 0.06994096564397366, + "language_loss": 0.82599872, + "learning_rate": 0.00031161239858309563, + "loss": 0.83686233, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.15405273, + "step": 3297, + "time_per_iteration": 2.599755048751831 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086667, + "balance_loss_mlp": 1.07108665, + "epoch": 0.6344747979992305, + "flos": 572031714816.0, + "grad_norm": 0.09286327126840728, + "language_loss": 0.8328709, + "learning_rate": 0.0003113238520567964, + "loss": 0.8437376, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.15563965, + "step": 3298, + "time_per_iteration": 2.728113889694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088611, + "balance_loss_mlp": 1.07316184, + "epoch": 0.634667179684494, + "flos": 605911601664.0, + "grad_norm": 0.09050699432092259, + "language_loss": 0.81456614, + "learning_rate": 0.00031103537878107403, + "loss": 0.82545221, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.15441895, + "step": 3299, + "time_per_iteration": 2.746675729751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091033, + "balance_loss_mlp": 1.07576215, + "epoch": 0.6348595613697576, + "flos": 646944537600.0, + "grad_norm": 0.08418360382923895, + "language_loss": 0.7968322, + "learning_rate": 0.0003107469788679238, + "loss": 0.8077426, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.15246582, + "step": 3300, + "time_per_iteration": 2.7789735794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086389, + "balance_loss_mlp": 1.07030737, + "epoch": 0.6350519430550212, + "flos": 639074935296.0, + "grad_norm": 0.07428233457329445, + "language_loss": 0.86447507, + "learning_rate": 0.00031045865242931267, + "loss": 0.87533897, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.1607666, + "step": 3301, + "time_per_iteration": 2.8069655895233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096867, + "balance_loss_mlp": 1.08088112, + "epoch": 0.6352443247402847, + "flos": 686437908480.0, + "grad_norm": 0.07374364047073086, + "language_loss": 0.83124268, + "learning_rate": 0.00031017039957717877, + "loss": 0.84221137, + 
"num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.15979004, + "step": 3302, + "time_per_iteration": 3.0203216075897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109177, + "balance_loss_mlp": 1.07607031, + "epoch": 0.6354367064255483, + "flos": 559442847744.0, + "grad_norm": 0.08011037824004849, + "language_loss": 0.88448334, + "learning_rate": 0.0003098822204234318, + "loss": 0.895401, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.15686035, + "step": 3303, + "time_per_iteration": 2.722560405731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086918, + "balance_loss_mlp": 1.07146788, + "epoch": 0.6356290881108119, + "flos": 979487520768.0, + "grad_norm": 0.14532397692109592, + "language_loss": 0.87361807, + "learning_rate": 0.00030959411507995273, + "loss": 0.88448727, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.15429688, + "step": 3304, + "time_per_iteration": 3.2270877361297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109049, + "balance_loss_mlp": 1.07495642, + "epoch": 0.6358214697960755, + "flos": 528278298624.0, + "grad_norm": 0.07985404208202107, + "language_loss": 0.80787814, + "learning_rate": 0.00030930608365859407, + "loss": 0.8187831, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.15515137, + "step": 3305, + "time_per_iteration": 2.7090413570404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.07174611, + "epoch": 0.6360138514813389, + "flos": 516811249152.0, + "grad_norm": 0.731689338993936, + "language_loss": 0.87885678, + "learning_rate": 0.00030901812627117943, + "loss": 0.88973522, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.16088867, + "step": 3306, + "time_per_iteration": 2.6327977180480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077096, + "balance_loss_mlp": 1.06109858, + "epoch": 0.6362062331666025, + "flos": 
466525163520.0, + "grad_norm": 0.09002939621512045, + "language_loss": 0.84808385, + "learning_rate": 0.000308730243029504, + "loss": 0.85885489, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.15979004, + "step": 3307, + "time_per_iteration": 2.6054556369781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088589, + "balance_loss_mlp": 1.07207811, + "epoch": 0.6363986148518661, + "flos": 549720193536.0, + "grad_norm": 0.0753497997145879, + "language_loss": 0.79653525, + "learning_rate": 0.0003084424340453339, + "loss": 0.80742109, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.16516113, + "step": 3308, + "time_per_iteration": 2.8042142391204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095017, + "balance_loss_mlp": 1.0775888, + "epoch": 0.6365909965371297, + "flos": 583049083392.0, + "grad_norm": 0.08328342026231418, + "language_loss": 0.82059419, + "learning_rate": 0.0003081546994304064, + "loss": 0.8315444, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.17443848, + "step": 3309, + "time_per_iteration": 2.7940802574157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100113, + "balance_loss_mlp": 1.08294737, + "epoch": 0.6367833782223933, + "flos": 531255739392.0, + "grad_norm": 0.07711723091328526, + "language_loss": 0.81634271, + "learning_rate": 0.0003078670392964298, + "loss": 0.82734382, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.17175293, + "step": 3310, + "time_per_iteration": 2.6288981437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111543, + "balance_loss_mlp": 1.09684515, + "epoch": 0.6369757599076568, + "flos": 569506526208.0, + "grad_norm": 0.09648821040849707, + "language_loss": 0.83039993, + "learning_rate": 0.00030757945375508406, + "loss": 0.84155422, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.18591309, + "step": 3311, + "time_per_iteration": 2.680053472518921 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120459, + "balance_loss_mlp": 1.10194564, + "epoch": 0.6371681415929203, + "flos": 539957892096.0, + "grad_norm": 0.07648325408881881, + "language_loss": 0.81110901, + "learning_rate": 0.00030729194291801944, + "loss": 0.82231361, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.18518066, + "step": 3312, + "time_per_iteration": 2.7345173358917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124831, + "balance_loss_mlp": 1.10598445, + "epoch": 0.6373605232781839, + "flos": 483566423040.0, + "grad_norm": 0.1187576427749129, + "language_loss": 0.76967251, + "learning_rate": 0.00030700450689685787, + "loss": 0.78092086, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.18847656, + "step": 3313, + "time_per_iteration": 2.5925910472869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134943, + "balance_loss_mlp": 1.11620378, + "epoch": 0.6375529049634475, + "flos": 578581636608.0, + "grad_norm": 0.086714433395562, + "language_loss": 0.85812229, + "learning_rate": 0.00030671714580319186, + "loss": 0.86947167, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.18762207, + "step": 3314, + "time_per_iteration": 2.8684160709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128682, + "balance_loss_mlp": 1.10954893, + "epoch": 0.637745286648711, + "flos": 682257530880.0, + "grad_norm": 0.07885995957457764, + "language_loss": 0.83140874, + "learning_rate": 0.0003064298597485846, + "loss": 0.84269553, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.19116211, + "step": 3315, + "time_per_iteration": 2.8987390995025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122628, + "balance_loss_mlp": 1.10333991, + "epoch": 0.6379376683339746, + "flos": 504637558272.0, + "grad_norm": 0.08106722698037498, + "language_loss": 0.84028, + "learning_rate": 0.00030614264884457054, + "loss": 
0.85150629, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.19274902, + "step": 3316, + "time_per_iteration": 2.671858787536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112572, + "balance_loss_mlp": 1.09383273, + "epoch": 0.6381300500192382, + "flos": 502020965376.0, + "grad_norm": 0.09520385776828669, + "language_loss": 0.77556765, + "learning_rate": 0.000305855513202655, + "loss": 0.78669333, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.18725586, + "step": 3317, + "time_per_iteration": 2.6103365421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105702, + "balance_loss_mlp": 1.08714104, + "epoch": 0.6383224317045018, + "flos": 400489961472.0, + "grad_norm": 0.0870793394439323, + "language_loss": 0.77407163, + "learning_rate": 0.0003055684529343138, + "loss": 0.78512859, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.18566895, + "step": 3318, + "time_per_iteration": 2.4441628456115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104453, + "balance_loss_mlp": 1.08614254, + "epoch": 0.6385148133897653, + "flos": 499377208320.0, + "grad_norm": 0.09431837628284816, + "language_loss": 0.78623343, + "learning_rate": 0.00030528146815099374, + "loss": 0.79727793, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.1829834, + "step": 3319, + "time_per_iteration": 2.6380391120910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092806, + "balance_loss_mlp": 1.07459044, + "epoch": 0.6387071950750288, + "flos": 527665632768.0, + "grad_norm": 0.0775286688862043, + "language_loss": 0.7192508, + "learning_rate": 0.00030499455896411203, + "loss": 0.73017889, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.18225098, + "step": 3320, + "time_per_iteration": 2.6337239742279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146548, + "balance_loss_mlp": 1.13748848, + "epoch": 0.6388995767602924, + 
"flos": 1455979069440.0, + "grad_norm": 0.05026445046140725, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77447361, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.09082031, + "step": 3321, + "time_per_iteration": 4.989959239959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080858, + "balance_loss_mlp": 1.06314373, + "epoch": 0.639091958445556, + "flos": 603895191552.0, + "grad_norm": 0.29371403446084504, + "language_loss": 0.76736987, + "learning_rate": 0.0003044209678251865, + "loss": 0.77817845, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.17712402, + "step": 3322, + "time_per_iteration": 2.9107608795166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075524, + "balance_loss_mlp": 1.05879879, + "epoch": 0.6392843401308196, + "flos": 584516694528.0, + "grad_norm": 0.07557324535671889, + "language_loss": 0.84569478, + "learning_rate": 0.0003041342860958306, + "loss": 0.85645002, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.1673584, + "step": 3323, + "time_per_iteration": 2.7665860652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010742, + "balance_loss_mlp": 1.0572598, + "epoch": 0.6394767218160831, + "flos": 514681413120.0, + "grad_norm": 0.11260284844343603, + "language_loss": 0.9165262, + "learning_rate": 0.00030384768040828857, + "loss": 0.92726815, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.16931152, + "step": 3324, + "time_per_iteration": 2.6840200424194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075186, + "balance_loss_mlp": 1.05894923, + "epoch": 0.6396691035013466, + "flos": 541732022784.0, + "grad_norm": 0.08385815306502278, + "language_loss": 0.85726339, + "learning_rate": 0.00030356115087383094, + "loss": 0.86801529, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.16235352, + "step": 3325, + "time_per_iteration": 2.685962200164795 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071194, + "balance_loss_mlp": 1.0543381, + "epoch": 0.6398614851866102, + "flos": 525535796736.0, + "grad_norm": 0.07882318349260847, + "language_loss": 0.85086048, + "learning_rate": 0.00030327469760369803, + "loss": 0.86157244, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.16870117, + "step": 3326, + "time_per_iteration": 2.5948264598846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075003, + "balance_loss_mlp": 1.05855227, + "epoch": 0.6400538668718738, + "flos": 622989937152.0, + "grad_norm": 0.09362500195471922, + "language_loss": 0.84774464, + "learning_rate": 0.0003029883207091009, + "loss": 0.8584947, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.16455078, + "step": 3327, + "time_per_iteration": 2.7647178173065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080953, + "balance_loss_mlp": 1.06489587, + "epoch": 0.6402462485571374, + "flos": 503367436800.0, + "grad_norm": 0.0837002807607971, + "language_loss": 0.7833994, + "learning_rate": 0.00030270202030122095, + "loss": 0.794209, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.16052246, + "step": 3328, + "time_per_iteration": 2.6863620281219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085596, + "balance_loss_mlp": 1.06944346, + "epoch": 0.6404386302424009, + "flos": 819247260672.0, + "grad_norm": 0.12091934143095992, + "language_loss": 0.86217034, + "learning_rate": 0.00030241579649121, + "loss": 0.87302625, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.16149902, + "step": 3329, + "time_per_iteration": 3.0689570903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094276, + "balance_loss_mlp": 1.07826567, + "epoch": 0.6406310119276645, + "flos": 471812677632.0, + "grad_norm": 0.07676724008110788, + "language_loss": 0.79411578, + "learning_rate": 0.00030212964939018994, + "loss": 
0.80505848, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.16003418, + "step": 3330, + "time_per_iteration": 2.619704484939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106355, + "balance_loss_mlp": 1.09061956, + "epoch": 0.6408233936129281, + "flos": 425583631872.0, + "grad_norm": 0.1228216310287833, + "language_loss": 0.85246855, + "learning_rate": 0.0003018435791092527, + "loss": 0.86353219, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.15722656, + "step": 3331, + "time_per_iteration": 2.5062122344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111787, + "balance_loss_mlp": 1.09569383, + "epoch": 0.6410157752981916, + "flos": 549784433664.0, + "grad_norm": 0.09250977947547825, + "language_loss": 0.80749196, + "learning_rate": 0.00030155758575946083, + "loss": 0.81860983, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.16088867, + "step": 3332, + "time_per_iteration": 2.649428129196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126368, + "balance_loss_mlp": 1.11058497, + "epoch": 0.6412081569834551, + "flos": 475899452928.0, + "grad_norm": 0.08516597392533326, + "language_loss": 0.83713603, + "learning_rate": 0.0003012716694518467, + "loss": 0.8483997, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.15771484, + "step": 3333, + "time_per_iteration": 2.6135807037353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132718, + "balance_loss_mlp": 1.11691082, + "epoch": 0.6414005386687187, + "flos": 540921494016.0, + "grad_norm": 0.07646899423626412, + "language_loss": 0.85365057, + "learning_rate": 0.000300985830297413, + "loss": 0.86497772, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.15795898, + "step": 3334, + "time_per_iteration": 2.7174272537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139874, + "balance_loss_mlp": 1.12411475, + "epoch": 0.6415929203539823, + 
"flos": 1041317379072.0, + "grad_norm": 0.09796479304717164, + "language_loss": 0.87037742, + "learning_rate": 0.00030070006840713205, + "loss": 0.88177609, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.1574707, + "step": 3335, + "time_per_iteration": 3.4015066623687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143096, + "balance_loss_mlp": 1.12753963, + "epoch": 0.6417853020392459, + "flos": 648337996800.0, + "grad_norm": 0.08601362960013602, + "language_loss": 0.73407865, + "learning_rate": 0.000300414383891947, + "loss": 0.74550962, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.15539551, + "step": 3336, + "time_per_iteration": 2.889427661895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147858, + "balance_loss_mlp": 1.13256359, + "epoch": 0.6419776837245095, + "flos": 500899147776.0, + "grad_norm": 0.06457734874365277, + "language_loss": 0.88771486, + "learning_rate": 0.00030012877686276973, + "loss": 0.89919341, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.15270996, + "step": 3337, + "time_per_iteration": 2.7751049995422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149838, + "balance_loss_mlp": 1.13451934, + "epoch": 0.642170065409773, + "flos": 620620392960.0, + "grad_norm": 0.07413872787438813, + "language_loss": 0.86947334, + "learning_rate": 0.0002998432474304832, + "loss": 0.88097167, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.1529541, + "step": 3338, + "time_per_iteration": 2.8443615436553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066826, + "balance_loss_mlp": 1.05876791, + "epoch": 0.6423624470950365, + "flos": 1423539629568.0, + "grad_norm": 0.0235298997703447, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.8030417, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.08056641, + "step": 3339, + "time_per_iteration": 4.873133659362793 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146518, + "balance_loss_mlp": 1.13105643, + "epoch": 0.6425548287803001, + "flos": 562353477120.0, + "grad_norm": 0.06001199999117321, + "language_loss": 0.88487816, + "learning_rate": 0.00029927242179996107, + "loss": 0.89634329, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.15441895, + "step": 3340, + "time_per_iteration": 2.7014224529266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144782, + "balance_loss_mlp": 1.12893939, + "epoch": 0.6427472104655637, + "flos": 585443220480.0, + "grad_norm": 0.0682782247360197, + "language_loss": 0.83006454, + "learning_rate": 0.0002989871258233398, + "loss": 0.84151232, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.1583252, + "step": 3341, + "time_per_iteration": 2.7891581058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137179, + "balance_loss_mlp": 1.12174153, + "epoch": 0.6429395921508272, + "flos": 404282700288.0, + "grad_norm": 0.10902678914385976, + "language_loss": 0.82279134, + "learning_rate": 0.0002987019078868373, + "loss": 0.83416307, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.15429688, + "step": 3342, + "time_per_iteration": 2.4937355518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135998, + "balance_loss_mlp": 1.12050128, + "epoch": 0.6431319738360908, + "flos": 548783755776.0, + "grad_norm": 0.07048504684512738, + "language_loss": 0.81617045, + "learning_rate": 0.00029841676810118484, + "loss": 0.8275305, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.15478516, + "step": 3343, + "time_per_iteration": 2.721240997314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011253, + "balance_loss_mlp": 1.10925424, + "epoch": 0.6433243555213544, + "flos": 793375368192.0, + "grad_norm": 0.08414428374798259, + "language_loss": 0.87345612, + "learning_rate": 0.0002981317065770839, + "loss": 
0.88470906, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.16040039, + "step": 3344, + "time_per_iteration": 3.06880521774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120449, + "balance_loss_mlp": 1.10509467, + "epoch": 0.643516737206618, + "flos": 583031831040.0, + "grad_norm": 0.10046839688715496, + "language_loss": 0.80932879, + "learning_rate": 0.00029784672342520493, + "loss": 0.82053328, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.15332031, + "step": 3345, + "time_per_iteration": 2.71240496635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118949, + "balance_loss_mlp": 1.10347569, + "epoch": 0.6437091188918815, + "flos": 518750936064.0, + "grad_norm": 0.10277118119313504, + "language_loss": 0.8364169, + "learning_rate": 0.00029756181875618834, + "loss": 0.84760636, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.15454102, + "step": 3346, + "time_per_iteration": 2.589006185531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110068, + "balance_loss_mlp": 1.09397459, + "epoch": 0.643901500577145, + "flos": 384946048512.0, + "grad_norm": 0.07329616069241408, + "language_loss": 0.83350551, + "learning_rate": 0.0002972769926806439, + "loss": 0.84460616, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.16088867, + "step": 3347, + "time_per_iteration": 2.4795889854431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102586, + "balance_loss_mlp": 1.08667207, + "epoch": 0.6440938822624086, + "flos": 483722067456.0, + "grad_norm": 0.08705096327396913, + "language_loss": 0.88483214, + "learning_rate": 0.0002969922453091508, + "loss": 0.89585805, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.15905762, + "step": 3348, + "time_per_iteration": 2.6210718154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097529, + "balance_loss_mlp": 1.08181691, + "epoch": 0.6442862639476722, + 
"flos": 540469241856.0, + "grad_norm": 0.07238968090478194, + "language_loss": 0.85106307, + "learning_rate": 0.00029670757675225777, + "loss": 0.86203837, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.15698242, + "step": 3349, + "time_per_iteration": 2.7721433639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110462, + "balance_loss_mlp": 1.08850336, + "epoch": 0.6444786456329358, + "flos": 526912003584.0, + "grad_norm": 0.07515890513129632, + "language_loss": 0.79165089, + "learning_rate": 0.0002964229871204831, + "loss": 0.80269712, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.16113281, + "step": 3350, + "time_per_iteration": 2.6707816123962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095562, + "balance_loss_mlp": 1.0801599, + "epoch": 0.6446710273181993, + "flos": 697892848128.0, + "grad_norm": 0.08798444553042223, + "language_loss": 0.83359706, + "learning_rate": 0.00029613847652431403, + "loss": 0.84455276, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.15380859, + "step": 3351, + "time_per_iteration": 3.076526403427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097548, + "balance_loss_mlp": 1.08212233, + "epoch": 0.6448634090034628, + "flos": 625023226368.0, + "grad_norm": 0.07638162033625162, + "language_loss": 0.79389453, + "learning_rate": 0.0002958540450742078, + "loss": 0.80487001, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.15405273, + "step": 3352, + "time_per_iteration": 2.9522883892059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088563, + "balance_loss_mlp": 1.0726012, + "epoch": 0.6450557906887264, + "flos": 600950057472.0, + "grad_norm": 0.0756305542343249, + "language_loss": 0.77240932, + "learning_rate": 0.0002955696928805901, + "loss": 0.78329492, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.1595459, + "step": 3353, + "time_per_iteration": 2.9094340801239014 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085355, + "balance_loss_mlp": 1.06963158, + "epoch": 0.64524817237399, + "flos": 646200820224.0, + "grad_norm": 0.06844554390728431, + "language_loss": 0.85999632, + "learning_rate": 0.0002952854200538563, + "loss": 0.87084985, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.15710449, + "step": 3354, + "time_per_iteration": 2.820434808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084907, + "balance_loss_mlp": 1.06869507, + "epoch": 0.6454405540592536, + "flos": 473411340288.0, + "grad_norm": 0.06913920875514136, + "language_loss": 0.81889141, + "learning_rate": 0.000295001226704371, + "loss": 0.82974052, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.16210938, + "step": 3355, + "time_per_iteration": 2.5986335277557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108489, + "balance_loss_mlp": 1.06890416, + "epoch": 0.6456329357445171, + "flos": 611841517056.0, + "grad_norm": 0.09885460373784571, + "language_loss": 0.82614869, + "learning_rate": 0.00029471711294246783, + "loss": 0.83699757, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.15979004, + "step": 3356, + "time_per_iteration": 2.7818820476531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108141, + "balance_loss_mlp": 1.06516171, + "epoch": 0.6458253174297807, + "flos": 731683901952.0, + "grad_norm": 0.08380792683960937, + "language_loss": 0.8265574, + "learning_rate": 0.0002944330788784494, + "loss": 0.83737159, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.16247559, + "step": 3357, + "time_per_iteration": 2.8812832832336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_mlp": 1.07064033, + "epoch": 0.6460176991150443, + "flos": 570413228544.0, + "grad_norm": 0.06446449543464593, + "language_loss": 0.84539986, + "learning_rate": 0.00029414912462258786, + "loss": 
0.85626698, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.16064453, + "step": 3358, + "time_per_iteration": 2.8379344940185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083974, + "balance_loss_mlp": 1.06774938, + "epoch": 0.6462100808003078, + "flos": 583160311296.0, + "grad_norm": 0.10894531444505327, + "language_loss": 0.81342053, + "learning_rate": 0.00029386525028512366, + "loss": 0.82426023, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.16223145, + "step": 3359, + "time_per_iteration": 2.7395105361938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085698, + "balance_loss_mlp": 1.06923473, + "epoch": 0.6464024624855714, + "flos": 483919557120.0, + "grad_norm": 0.0747784188423731, + "language_loss": 0.87245089, + "learning_rate": 0.0002935814559762666, + "loss": 0.88330787, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.16467285, + "step": 3360, + "time_per_iteration": 2.784308433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085107, + "balance_loss_mlp": 1.0693121, + "epoch": 0.6465948441708349, + "flos": 527774289408.0, + "grad_norm": 0.08701816343454002, + "language_loss": 0.79713386, + "learning_rate": 0.0002932977418061957, + "loss": 0.80798495, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.15783691, + "step": 3361, + "time_per_iteration": 2.6461353302001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085334, + "balance_loss_mlp": 1.06910968, + "epoch": 0.6467872258560985, + "flos": 669421615104.0, + "grad_norm": 0.07872387718788462, + "language_loss": 0.80426037, + "learning_rate": 0.00029301410788505833, + "loss": 0.81511372, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.16223145, + "step": 3362, + "time_per_iteration": 2.813366413116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092118, + "balance_loss_mlp": 1.07586968, + "epoch": 0.6469796075413621, + 
"flos": 432101620224.0, + "grad_norm": 0.12115890254609105, + "language_loss": 0.80987, + "learning_rate": 0.00029273055432297126, + "loss": 0.82079118, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.16247559, + "step": 3363, + "time_per_iteration": 2.511802911758423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084973, + "balance_loss_mlp": 1.06877291, + "epoch": 0.6471719892266257, + "flos": 803750335488.0, + "grad_norm": 0.06785413564758717, + "language_loss": 0.80699545, + "learning_rate": 0.00029244708123001917, + "loss": 0.81784511, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.1619873, + "step": 3364, + "time_per_iteration": 2.980236768722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_mlp": 1.06629229, + "epoch": 0.6473643709118891, + "flos": 577208001024.0, + "grad_norm": 0.06532727643194694, + "language_loss": 0.84224701, + "learning_rate": 0.0002921636887162565, + "loss": 0.85307598, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.16601562, + "step": 3365, + "time_per_iteration": 2.7428975105285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108053, + "balance_loss_mlp": 1.06452012, + "epoch": 0.6475567525971527, + "flos": 761420113920.0, + "grad_norm": 0.08750639316887465, + "language_loss": 0.83789468, + "learning_rate": 0.00029188037689170595, + "loss": 0.84869999, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.16015625, + "step": 3366, + "time_per_iteration": 2.982468843460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088124, + "balance_loss_mlp": 1.07188749, + "epoch": 0.6477491342824163, + "flos": 843103116288.0, + "grad_norm": 0.08320193345664485, + "language_loss": 0.84052682, + "learning_rate": 0.0002915971458663586, + "loss": 0.85140812, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.16235352, + "step": 3367, + "time_per_iteration": 3.065324544906616 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083268, + "balance_loss_mlp": 1.06736565, + "epoch": 0.6479415159676799, + "flos": 884820298752.0, + "grad_norm": 0.05677832621363699, + "language_loss": 0.81828123, + "learning_rate": 0.00029131399575017494, + "loss": 0.82911396, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.15893555, + "step": 3368, + "time_per_iteration": 3.181908130645752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079967, + "balance_loss_mlp": 1.06356418, + "epoch": 0.6481338976529435, + "flos": 615513116160.0, + "grad_norm": 0.06942657132452239, + "language_loss": 0.85825574, + "learning_rate": 0.0002910309266530836, + "loss": 0.86905545, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.16394043, + "step": 3369, + "time_per_iteration": 2.853720188140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088509, + "balance_loss_mlp": 1.07184362, + "epoch": 0.648326279338207, + "flos": 510009136128.0, + "grad_norm": 0.07294925801459864, + "language_loss": 0.85398757, + "learning_rate": 0.0002907479386849814, + "loss": 0.86487263, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.16674805, + "step": 3370, + "time_per_iteration": 2.6536483764648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087231, + "balance_loss_mlp": 1.07115006, + "epoch": 0.6485186610234706, + "flos": 702498313728.0, + "grad_norm": 0.07555767339511835, + "language_loss": 0.80052334, + "learning_rate": 0.0002904650319557339, + "loss": 0.81139565, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.1607666, + "step": 3371, + "time_per_iteration": 2.996121644973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109038, + "balance_loss_mlp": 1.07432294, + "epoch": 0.6487110427087341, + "flos": 560683233792.0, + "grad_norm": 0.08951276836264582, + "language_loss": 0.80922645, + "learning_rate": 0.0002901822065751758, + "loss": 
0.82013029, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.16052246, + "step": 3372, + "time_per_iteration": 2.6797337532043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091281, + "balance_loss_mlp": 1.07499671, + "epoch": 0.6489034243939977, + "flos": 680100530688.0, + "grad_norm": 0.07760680583189275, + "language_loss": 0.85333431, + "learning_rate": 0.0002898994626531093, + "loss": 0.86424708, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.1628418, + "step": 3373, + "time_per_iteration": 2.9142796993255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096791, + "balance_loss_mlp": 1.08061421, + "epoch": 0.6490958060792612, + "flos": 474412018176.0, + "grad_norm": 0.08009556934664804, + "language_loss": 0.87685299, + "learning_rate": 0.00028961680029930526, + "loss": 0.88782084, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.16174316, + "step": 3374, + "time_per_iteration": 2.6317813396453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093542, + "balance_loss_mlp": 1.07774687, + "epoch": 0.6492881877645248, + "flos": 588850518528.0, + "grad_norm": 0.10403413737610764, + "language_loss": 0.76691556, + "learning_rate": 0.00028933421962350317, + "loss": 0.77785093, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.15783691, + "step": 3375, + "time_per_iteration": 2.74595046043396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100726, + "balance_loss_mlp": 1.08450198, + "epoch": 0.6494805694497884, + "flos": 642427905024.0, + "grad_norm": 0.07424311731370936, + "language_loss": 0.83669561, + "learning_rate": 0.0002890517207354104, + "loss": 0.84770286, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.16223145, + "step": 3376, + "time_per_iteration": 2.8577523231506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099818, + "balance_loss_mlp": 1.08361781, + "epoch": 0.649672951135052, + 
"flos": 531806736384.0, + "grad_norm": 0.06432673678328359, + "language_loss": 0.81672311, + "learning_rate": 0.0002887693037447029, + "loss": 0.8277213, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.1619873, + "step": 3377, + "time_per_iteration": 2.6683990955352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105741, + "balance_loss_mlp": 1.08960009, + "epoch": 0.6498653328203156, + "flos": 547387725312.0, + "grad_norm": 0.07747237363715123, + "language_loss": 0.81861436, + "learning_rate": 0.00028848696876102443, + "loss": 0.8296718, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.16137695, + "step": 3378, + "time_per_iteration": 2.6596877574920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104851, + "balance_loss_mlp": 1.08822083, + "epoch": 0.650057714505579, + "flos": 462228415488.0, + "grad_norm": 0.07839901115020462, + "language_loss": 0.83414477, + "learning_rate": 0.00028820471589398723, + "loss": 0.84519327, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.16638184, + "step": 3379, + "time_per_iteration": 2.5814483165740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097332, + "balance_loss_mlp": 1.08126247, + "epoch": 0.6502500961908426, + "flos": 510172121088.0, + "grad_norm": 0.08703202670107367, + "language_loss": 0.77904689, + "learning_rate": 0.00028792254525317196, + "loss": 0.79002023, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.16064453, + "step": 3380, + "time_per_iteration": 2.72318434715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097321, + "balance_loss_mlp": 1.08113241, + "epoch": 0.6504424778761062, + "flos": 579827165184.0, + "grad_norm": 0.1026330165039415, + "language_loss": 0.81341857, + "learning_rate": 0.00028764045694812645, + "loss": 0.82439172, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.16186523, + "step": 3381, + "time_per_iteration": 2.8238747119903564 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094282, + "balance_loss_mlp": 1.07821298, + "epoch": 0.6506348595613698, + "flos": 519457577472.0, + "grad_norm": 0.08728487629986259, + "language_loss": 0.76526588, + "learning_rate": 0.0002873584510883671, + "loss": 0.7762087, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.16064453, + "step": 3382, + "time_per_iteration": 2.577808380126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093616, + "balance_loss_mlp": 1.07766557, + "epoch": 0.6508272412466333, + "flos": 510310513152.0, + "grad_norm": 0.0816081367249066, + "language_loss": 0.8610574, + "learning_rate": 0.0002870765277833788, + "loss": 0.87199354, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.15942383, + "step": 3383, + "time_per_iteration": 2.694575071334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084046, + "balance_loss_mlp": 1.06782174, + "epoch": 0.6510196229318969, + "flos": 625623782400.0, + "grad_norm": 0.07010273860249229, + "language_loss": 0.80279285, + "learning_rate": 0.00028679468714261347, + "loss": 0.81363332, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.16210938, + "step": 3384, + "time_per_iteration": 2.778613805770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108697, + "balance_loss_mlp": 1.07105613, + "epoch": 0.6512120046171604, + "flos": 474696142848.0, + "grad_norm": 0.08179084803360179, + "language_loss": 0.76861918, + "learning_rate": 0.0002865129292754918, + "loss": 0.77948892, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.15905762, + "step": 3385, + "time_per_iteration": 2.579616069793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089266, + "balance_loss_mlp": 1.07348299, + "epoch": 0.651404386302424, + "flos": 551854798848.0, + "grad_norm": 0.07682712514001584, + "language_loss": 0.81799757, + "learning_rate": 0.00028623125429140105, + "loss": 
0.82889026, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.15771484, + "step": 3386, + "time_per_iteration": 2.867527961730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088264, + "balance_loss_mlp": 1.07212317, + "epoch": 0.6515967679876876, + "flos": 523311985152.0, + "grad_norm": 0.08394692910439203, + "language_loss": 0.86795825, + "learning_rate": 0.00028594966229969785, + "loss": 0.8788408, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.16137695, + "step": 3387, + "time_per_iteration": 2.729546546936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090562, + "balance_loss_mlp": 1.07511222, + "epoch": 0.6517891496729511, + "flos": 573874854912.0, + "grad_norm": 0.06943959400657512, + "language_loss": 0.80838251, + "learning_rate": 0.00028566815340970577, + "loss": 0.81928813, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.15429688, + "step": 3388, + "time_per_iteration": 2.7792935371398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087811, + "balance_loss_mlp": 1.07234919, + "epoch": 0.6519815313582147, + "flos": 555926893056.0, + "grad_norm": 0.10254381492553745, + "language_loss": 0.80897045, + "learning_rate": 0.0002853867277307162, + "loss": 0.81984854, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.15441895, + "step": 3389, + "time_per_iteration": 2.6666784286499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087817, + "balance_loss_mlp": 1.07184315, + "epoch": 0.6521739130434783, + "flos": 480487666176.0, + "grad_norm": 0.06532744179053884, + "language_loss": 0.82385129, + "learning_rate": 0.00028510538537198824, + "loss": 0.83472943, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.15966797, + "step": 3390, + "time_per_iteration": 2.6052095890045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088167, + "balance_loss_mlp": 1.0724194, + "epoch": 0.6523662947287419, 
+ "flos": 665707797504.0, + "grad_norm": 0.0671667027418021, + "language_loss": 0.86489713, + "learning_rate": 0.00028482412644274867, + "loss": 0.87577885, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.15734863, + "step": 3391, + "time_per_iteration": 2.987199068069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086228, + "balance_loss_mlp": 1.07036138, + "epoch": 0.6525586764140053, + "flos": 548655275520.0, + "grad_norm": 0.07499708088395778, + "language_loss": 0.74256724, + "learning_rate": 0.00028454295105219207, + "loss": 0.75342953, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.15856934, + "step": 3392, + "time_per_iteration": 2.6695480346679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_mlp": 1.07157481, + "epoch": 0.6527510580992689, + "flos": 802900159488.0, + "grad_norm": 0.1096122029858208, + "language_loss": 0.79101622, + "learning_rate": 0.0002842618593094802, + "loss": 0.80188692, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.15478516, + "step": 3393, + "time_per_iteration": 3.142174005508423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091326, + "balance_loss_mlp": 1.07560241, + "epoch": 0.6529434397845325, + "flos": 671166010368.0, + "grad_norm": 0.19909824344708926, + "language_loss": 0.80162621, + "learning_rate": 0.00028398085132374243, + "loss": 0.81253946, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.15710449, + "step": 3394, + "time_per_iteration": 2.806171178817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088691, + "balance_loss_mlp": 1.0730865, + "epoch": 0.6531358214697961, + "flos": 828409006080.0, + "grad_norm": 0.0722395804995529, + "language_loss": 0.84118348, + "learning_rate": 0.0002836999272040761, + "loss": 0.85207039, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.15588379, + "step": 3395, + "time_per_iteration": 3.128824472427368 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084488, + "balance_loss_mlp": 1.06822813, + "epoch": 0.6533282031550597, + "flos": 487403578368.0, + "grad_norm": 0.09318824389102451, + "language_loss": 0.84164572, + "learning_rate": 0.00028341908705954575, + "loss": 0.85249066, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.16259766, + "step": 3396, + "time_per_iteration": 2.557788848876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_mlp": 1.02878892, + "epoch": 0.6535205848403232, + "flos": 1557744638976.0, + "grad_norm": 0.02499747556734328, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82797897, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.07275391, + "step": 3397, + "time_per_iteration": 4.86290979385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095714, + "balance_loss_mlp": 1.07970405, + "epoch": 0.6537129665255867, + "flos": 493711593984.0, + "grad_norm": 0.0751394003392184, + "language_loss": 0.78380162, + "learning_rate": 0.00028285765913198604, + "loss": 0.7947588, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.16003418, + "step": 3398, + "time_per_iteration": 2.600771903991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098784, + "balance_loss_mlp": 1.08344197, + "epoch": 0.6539053482108503, + "flos": 605002328064.0, + "grad_norm": 0.09108234691861208, + "language_loss": 0.81861216, + "learning_rate": 0.0002825770715669227, + "loss": 0.82960004, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.15319824, + "step": 3399, + "time_per_iteration": 2.737407684326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109606, + "balance_loss_mlp": 1.08050275, + "epoch": 0.6540977298961139, + "flos": 577778821632.0, + "grad_norm": 0.06648810793188806, + "language_loss": 0.81205964, + "learning_rate": 0.00028229656841292634, + "loss": 
0.82302022, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.15539551, + "step": 3400, + "time_per_iteration": 2.772174596786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094687, + "balance_loss_mlp": 1.07884359, + "epoch": 0.6542901115813774, + "flos": 511753531392.0, + "grad_norm": 0.09125126415634116, + "language_loss": 0.76617396, + "learning_rate": 0.0002820161497788979, + "loss": 0.77712083, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.1583252, + "step": 3401, + "time_per_iteration": 2.5956950187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097852, + "balance_loss_mlp": 1.08250988, + "epoch": 0.654482493266641, + "flos": 625495302144.0, + "grad_norm": 0.07069704571698167, + "language_loss": 0.8703959, + "learning_rate": 0.00028173581577370545, + "loss": 0.88137436, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.15332031, + "step": 3402, + "time_per_iteration": 2.781916379928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092911, + "balance_loss_mlp": 1.0775454, + "epoch": 0.6546748749519046, + "flos": 523981550592.0, + "grad_norm": 0.07155783234784782, + "language_loss": 0.7877273, + "learning_rate": 0.0002814555665061844, + "loss": 0.7986564, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.15356445, + "step": 3403, + "time_per_iteration": 2.6751809120178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097131, + "balance_loss_mlp": 1.08174062, + "epoch": 0.6548672566371682, + "flos": 479210204160.0, + "grad_norm": 0.0786668286900623, + "language_loss": 0.77486473, + "learning_rate": 0.00028117540208513715, + "loss": 0.78583604, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.15368652, + "step": 3404, + "time_per_iteration": 2.749115228652954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102022, + "balance_loss_mlp": 1.08668029, + "epoch": 0.6550596383224317, + 
"flos": 616012356096.0, + "grad_norm": 0.08182139934460984, + "language_loss": 0.84582227, + "learning_rate": 0.00028089532261933313, + "loss": 0.85684246, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.15319824, + "step": 3405, + "time_per_iteration": 2.7621490955352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103666, + "balance_loss_mlp": 1.08819294, + "epoch": 0.6552520200076952, + "flos": 488836684800.0, + "grad_norm": 0.09214410473425906, + "language_loss": 0.85497427, + "learning_rate": 0.0002806153282175087, + "loss": 0.8660109, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.15454102, + "step": 3406, + "time_per_iteration": 2.554258346557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106093, + "balance_loss_mlp": 1.09101284, + "epoch": 0.6554444016929588, + "flos": 687619196928.0, + "grad_norm": 0.08390483637961621, + "language_loss": 0.8305704, + "learning_rate": 0.0002803354189883679, + "loss": 0.84163129, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.15063477, + "step": 3407, + "time_per_iteration": 2.8450276851654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106209, + "balance_loss_mlp": 1.09085512, + "epoch": 0.6556367833782224, + "flos": 543051330048.0, + "grad_norm": 0.0734148655428184, + "language_loss": 0.85336381, + "learning_rate": 0.00028005559504058053, + "loss": 0.8644259, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.15332031, + "step": 3408, + "time_per_iteration": 2.7837629318237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093577, + "balance_loss_mlp": 1.07772207, + "epoch": 0.655829165063486, + "flos": 673535554560.0, + "grad_norm": 0.16856049180871682, + "language_loss": 0.76636934, + "learning_rate": 0.0002797758564827838, + "loss": 0.77730507, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.15844727, + "step": 3409, + "time_per_iteration": 2.822136402130127 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110799, + "balance_loss_mlp": 1.09300518, + "epoch": 0.6560215467487496, + "flos": 531806736384.0, + "grad_norm": 0.07117651183978699, + "language_loss": 0.83604807, + "learning_rate": 0.0002794962034235824, + "loss": 0.84712797, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.14953613, + "step": 3410, + "time_per_iteration": 2.66369891166687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099109, + "balance_loss_mlp": 1.08368373, + "epoch": 0.656213928434013, + "flos": 591311467008.0, + "grad_norm": 0.08491575127414114, + "language_loss": 0.7504462, + "learning_rate": 0.00027921663597154695, + "loss": 0.7614373, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.15405273, + "step": 3411, + "time_per_iteration": 2.7551727294921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092638, + "balance_loss_mlp": 1.07776022, + "epoch": 0.6564063101192766, + "flos": 415786825728.0, + "grad_norm": 0.11157782453846309, + "language_loss": 0.8107205, + "learning_rate": 0.00027893715423521525, + "loss": 0.82164693, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.14868164, + "step": 3412, + "time_per_iteration": 2.472046375274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091359, + "balance_loss_mlp": 1.07526577, + "epoch": 0.6565986918045402, + "flos": 453321059328.0, + "grad_norm": 0.09147019628997788, + "language_loss": 0.83780456, + "learning_rate": 0.00027865775832309163, + "loss": 0.84871817, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.16088867, + "step": 3413, + "time_per_iteration": 2.66375470161438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095598, + "balance_loss_mlp": 1.08004141, + "epoch": 0.6567910734898038, + "flos": 547746001920.0, + "grad_norm": 0.06942494877025387, + "language_loss": 0.86058021, + "learning_rate": 0.00027837844834364733, + "loss": 
0.8715362, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.15539551, + "step": 3414, + "time_per_iteration": 2.664980173110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078659, + "balance_loss_mlp": 1.06337631, + "epoch": 0.6569834551750673, + "flos": 655518210048.0, + "grad_norm": 0.06137400431250788, + "language_loss": 0.86412358, + "learning_rate": 0.00027809922440532, + "loss": 0.87491024, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.15270996, + "step": 3415, + "time_per_iteration": 2.8617782592773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078566, + "balance_loss_mlp": 1.06294966, + "epoch": 0.6571758368603309, + "flos": 539681107968.0, + "grad_norm": 0.07494876333939839, + "language_loss": 0.80630636, + "learning_rate": 0.00027782008661651406, + "loss": 0.81709206, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.15600586, + "step": 3416, + "time_per_iteration": 2.7988359928131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079692, + "balance_loss_mlp": 1.06403971, + "epoch": 0.6573682185455945, + "flos": 497346117120.0, + "grad_norm": 0.0566932008774944, + "language_loss": 0.87253273, + "learning_rate": 0.00027754103508560013, + "loss": 0.88332963, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.15637207, + "step": 3417, + "time_per_iteration": 2.63442063331604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083157, + "balance_loss_mlp": 1.06763625, + "epoch": 0.657560600230858, + "flos": 447465295872.0, + "grad_norm": 0.07288802055505218, + "language_loss": 0.82797182, + "learning_rate": 0.0002772620699209163, + "loss": 0.83880341, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.1550293, + "step": 3418, + "time_per_iteration": 2.5753917694091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082218, + "balance_loss_mlp": 1.06645882, + "epoch": 0.6577529819161216, + 
"flos": 481940596224.0, + "grad_norm": 0.08272412201155772, + "language_loss": 0.79880875, + "learning_rate": 0.0002769831912307658, + "loss": 0.80963099, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.1574707, + "step": 3419, + "time_per_iteration": 2.57000732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087297, + "balance_loss_mlp": 1.07162082, + "epoch": 0.6579453636013851, + "flos": 530843134464.0, + "grad_norm": 0.09925745188023151, + "language_loss": 0.80301243, + "learning_rate": 0.00027670439912341917, + "loss": 0.81388539, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.15661621, + "step": 3420, + "time_per_iteration": 2.6356050968170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108308, + "balance_loss_mlp": 1.06716549, + "epoch": 0.6581377452866487, + "flos": 628037743104.0, + "grad_norm": 0.0799291491057382, + "language_loss": 0.8347252, + "learning_rate": 0.0002764256937071129, + "loss": 0.84555596, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.15905762, + "step": 3421, + "time_per_iteration": 2.834642171859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108912, + "balance_loss_mlp": 1.07362247, + "epoch": 0.6583301269719123, + "flos": 548618199552.0, + "grad_norm": 0.07238989087994178, + "language_loss": 0.87290871, + "learning_rate": 0.00027614707509005036, + "loss": 0.88379991, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.15478516, + "step": 3422, + "time_per_iteration": 2.725083112716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094754, + "balance_loss_mlp": 1.0789113, + "epoch": 0.6585225086571759, + "flos": 427493583360.0, + "grad_norm": 0.07227108008700142, + "language_loss": 0.79197395, + "learning_rate": 0.0002758685433804008, + "loss": 0.80292153, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.15844727, + "step": 3423, + "time_per_iteration": 2.487705945968628 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088641, + "balance_loss_mlp": 1.07272685, + "epoch": 0.6587148903424394, + "flos": 859620542976.0, + "grad_norm": 0.08320385710428496, + "language_loss": 0.7929312, + "learning_rate": 0.00027559009868630005, + "loss": 0.80381757, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.15905762, + "step": 3424, + "time_per_iteration": 3.1158218383789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089311, + "balance_loss_mlp": 1.07396901, + "epoch": 0.6589072720277029, + "flos": 805630551552.0, + "grad_norm": 0.09716201844809616, + "language_loss": 0.79885697, + "learning_rate": 0.0002753117411158491, + "loss": 0.80975008, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.15319824, + "step": 3425, + "time_per_iteration": 3.073878049850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087694, + "balance_loss_mlp": 1.07177925, + "epoch": 0.6590996537129665, + "flos": 548618199552.0, + "grad_norm": 0.07092925892223757, + "language_loss": 0.89722019, + "learning_rate": 0.0002750334707771168, + "loss": 0.90809715, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.15905762, + "step": 3426, + "time_per_iteration": 2.656442403793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088978, + "balance_loss_mlp": 1.07301569, + "epoch": 0.6592920353982301, + "flos": 454166092800.0, + "grad_norm": 0.08902896805701885, + "language_loss": 0.81059277, + "learning_rate": 0.0002747552877781369, + "loss": 0.82148254, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.1595459, + "step": 3427, + "time_per_iteration": 2.5491814613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078822, + "balance_loss_mlp": 1.06326556, + "epoch": 0.6594844170834937, + "flos": 567174057984.0, + "grad_norm": 0.07347862596751242, + "language_loss": 0.82162035, + "learning_rate": 0.0002744771922269097, + "loss": 
0.83240855, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.15539551, + "step": 3428, + "time_per_iteration": 2.7601091861724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079567, + "balance_loss_mlp": 1.06411695, + "epoch": 0.6596767987687572, + "flos": 1187911194624.0, + "grad_norm": 0.06220461144650757, + "language_loss": 0.81962168, + "learning_rate": 0.0002741991842314015, + "loss": 0.83041739, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.15429688, + "step": 3429, + "time_per_iteration": 3.5186891555786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073966, + "balance_loss_mlp": 1.05820632, + "epoch": 0.6598691804540208, + "flos": 503491147776.0, + "grad_norm": 0.08620393230779241, + "language_loss": 0.85952473, + "learning_rate": 0.0002739212638995445, + "loss": 0.87026429, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.15759277, + "step": 3430, + "time_per_iteration": 2.5625457763671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076808, + "balance_loss_mlp": 1.0606432, + "epoch": 0.6600615621392844, + "flos": 531337231872.0, + "grad_norm": 0.07532946000641748, + "language_loss": 0.82907259, + "learning_rate": 0.00027364343133923696, + "loss": 0.83984065, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.16162109, + "step": 3431, + "time_per_iteration": 2.716170310974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082258, + "balance_loss_mlp": 1.06649864, + "epoch": 0.6602539438245479, + "flos": 565446915072.0, + "grad_norm": 0.08436879524299454, + "language_loss": 0.82516879, + "learning_rate": 0.0002733656866583431, + "loss": 0.83599138, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.1574707, + "step": 3432, + "time_per_iteration": 2.718897581100464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081528, + "balance_loss_mlp": 1.06595933, + "epoch": 0.6604463255098114, + 
"flos": 857159594496.0, + "grad_norm": 0.09838634377037979, + "language_loss": 0.82723475, + "learning_rate": 0.0002730880299646927, + "loss": 0.83805001, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.15551758, + "step": 3433, + "time_per_iteration": 3.057305097579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077745, + "balance_loss_mlp": 1.06209278, + "epoch": 0.660638707195075, + "flos": 674462080512.0, + "grad_norm": 0.08984858233417439, + "language_loss": 0.85275245, + "learning_rate": 0.0002728104613660821, + "loss": 0.86352998, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.15637207, + "step": 3434, + "time_per_iteration": 2.9105958938598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088498, + "balance_loss_mlp": 1.0729531, + "epoch": 0.6608310888803386, + "flos": 888961402368.0, + "grad_norm": 0.07627448078148022, + "language_loss": 0.83088267, + "learning_rate": 0.0002725329809702729, + "loss": 0.84176767, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.15527344, + "step": 3435, + "time_per_iteration": 3.2121472358703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084827, + "balance_loss_mlp": 1.06942487, + "epoch": 0.6610234705656022, + "flos": 1136347646976.0, + "grad_norm": 0.07852463720700995, + "language_loss": 0.761163, + "learning_rate": 0.0002722555888849921, + "loss": 0.77201122, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.15380859, + "step": 3436, + "time_per_iteration": 3.4559333324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086334, + "balance_loss_mlp": 1.07087219, + "epoch": 0.6612158522508658, + "flos": 468012598272.0, + "grad_norm": 0.0792973185354779, + "language_loss": 0.80341804, + "learning_rate": 0.00027197828521793334, + "loss": 0.8142814, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.15441895, + "step": 3437, + "time_per_iteration": 2.5690367221832275 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083868, + "balance_loss_mlp": 1.06843066, + "epoch": 0.6614082339361292, + "flos": 571653614592.0, + "grad_norm": 0.06733043593984989, + "language_loss": 0.84736151, + "learning_rate": 0.0002717010700767552, + "loss": 0.85820019, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.1541748, + "step": 3438, + "time_per_iteration": 2.730611562728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086291, + "balance_loss_mlp": 1.070472, + "epoch": 0.6616006156213928, + "flos": 498467934720.0, + "grad_norm": 0.08565730680483608, + "language_loss": 0.75949776, + "learning_rate": 0.00027142394356908226, + "loss": 0.77036071, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.15808105, + "step": 3439, + "time_per_iteration": 2.591198444366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085276, + "balance_loss_mlp": 1.06955266, + "epoch": 0.6617929973066564, + "flos": 602420239872.0, + "grad_norm": 0.0872499602859338, + "language_loss": 0.84998727, + "learning_rate": 0.00027114690580250456, + "loss": 0.86083996, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.15710449, + "step": 3440, + "time_per_iteration": 2.8140435218811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085777, + "balance_loss_mlp": 1.07024395, + "epoch": 0.66198537899192, + "flos": 522983443968.0, + "grad_norm": 0.08876917190153409, + "language_loss": 0.87157035, + "learning_rate": 0.0002708699568845776, + "loss": 0.88242811, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.15515137, + "step": 3441, + "time_per_iteration": 2.6945717334747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009938, + "balance_loss_mlp": 1.0022608, + "epoch": 0.6621777606771835, + "flos": 1566256642560.0, + "grad_norm": 0.011925330418983318, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 
0.8029772, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.07666016, + "step": 3442, + "time_per_iteration": 2.3348398208618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084289, + "balance_loss_mlp": 1.06864882, + "epoch": 0.6623701423624471, + "flos": 526664954880.0, + "grad_norm": 0.08033260761499388, + "language_loss": 0.83087707, + "learning_rate": 0.0002703163260247261, + "loss": 0.84171999, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.15625, + "step": 3443, + "time_per_iteration": 2.6553611755371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088719, + "balance_loss_mlp": 1.07361555, + "epoch": 0.6625625240477107, + "flos": 528179553792.0, + "grad_norm": 0.07663781352911521, + "language_loss": 0.81678534, + "learning_rate": 0.0002700396442977399, + "loss": 0.82767254, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.15087891, + "step": 3444, + "time_per_iteration": 2.6430938243865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085928, + "balance_loss_mlp": 1.07055044, + "epoch": 0.6627549057329742, + "flos": 473122073088.0, + "grad_norm": 0.07089600928708202, + "language_loss": 0.8410843, + "learning_rate": 0.0002697630518492817, + "loss": 0.85194361, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.15356445, + "step": 3445, + "time_per_iteration": 2.6689202785491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083941, + "balance_loss_mlp": 1.06836057, + "epoch": 0.6629472874182378, + "flos": 527996745216.0, + "grad_norm": 0.062094097648188885, + "language_loss": 0.85618681, + "learning_rate": 0.0002694865487867343, + "loss": 0.86702615, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.15563965, + "step": 3446, + "time_per_iteration": 2.637334108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080989, + "balance_loss_mlp": 1.0658257, + "epoch": 0.6631396691035013, + 
"flos": 613200471552.0, + "grad_norm": 0.05358697815550736, + "language_loss": 0.84430885, + "learning_rate": 0.0002692101352174453, + "loss": 0.85511881, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.15148926, + "step": 3447, + "time_per_iteration": 2.8312788009643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088399, + "balance_loss_mlp": 1.0726161, + "epoch": 0.6633320507887649, + "flos": 609318899712.0, + "grad_norm": 0.09194525285750592, + "language_loss": 0.84284896, + "learning_rate": 0.00026893381124872787, + "loss": 0.85373294, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.15771484, + "step": 3448, + "time_per_iteration": 2.736720323562622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088634, + "balance_loss_mlp": 1.0732677, + "epoch": 0.6635244324740285, + "flos": 749700873216.0, + "grad_norm": 0.07278095853134793, + "language_loss": 0.80550098, + "learning_rate": 0.00026865757698786097, + "loss": 0.8163873, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.15344238, + "step": 3449, + "time_per_iteration": 3.0738682746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089191, + "balance_loss_mlp": 1.07368147, + "epoch": 0.6637168141592921, + "flos": 664526882304.0, + "grad_norm": 0.07101525688277542, + "language_loss": 0.8160826, + "learning_rate": 0.000268381432542088, + "loss": 0.82697451, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.15490723, + "step": 3450, + "time_per_iteration": 2.8799893856048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079264, + "balance_loss_mlp": 1.06351662, + "epoch": 0.6639091958445555, + "flos": 606783799296.0, + "grad_norm": 0.07040751242592289, + "language_loss": 0.79836357, + "learning_rate": 0.00026810537801861807, + "loss": 0.80915618, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.15734863, + "step": 3451, + "time_per_iteration": 2.763744831085205 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080047, + "balance_loss_mlp": 1.06437111, + "epoch": 0.6641015775298191, + "flos": 476697498624.0, + "grad_norm": 0.06736920749763163, + "language_loss": 0.81280744, + "learning_rate": 0.0002678294135246243, + "loss": 0.82360792, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.15661621, + "step": 3452, + "time_per_iteration": 2.7365012168884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077602, + "balance_loss_mlp": 1.06230712, + "epoch": 0.6642939592150827, + "flos": 904115105280.0, + "grad_norm": 0.09242159169716382, + "language_loss": 0.86158502, + "learning_rate": 0.0002675535391672463, + "loss": 0.872361, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.15270996, + "step": 3453, + "time_per_iteration": 3.123499870300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074699, + "balance_loss_mlp": 1.05919051, + "epoch": 0.6644863409003463, + "flos": 581808697344.0, + "grad_norm": 0.061204166777709124, + "language_loss": 0.86025405, + "learning_rate": 0.0002672777550535877, + "loss": 0.87100101, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.15490723, + "step": 3454, + "time_per_iteration": 2.810504913330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075076, + "balance_loss_mlp": 1.05937576, + "epoch": 0.6646787225856099, + "flos": 479002802688.0, + "grad_norm": 0.06544819992016976, + "language_loss": 0.85221612, + "learning_rate": 0.00026700206129071747, + "loss": 0.8629669, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.15686035, + "step": 3455, + "time_per_iteration": 2.55995774269104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077461, + "balance_loss_mlp": 1.0617609, + "epoch": 0.6648711042708734, + "flos": 449906420736.0, + "grad_norm": 0.0675654984198721, + "language_loss": 0.88715339, + "learning_rate": 0.00026672645798566925, + "loss": 
0.897928, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.15698242, + "step": 3456, + "time_per_iteration": 2.535385847091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070254, + "balance_loss_mlp": 1.05481672, + "epoch": 0.665063485956137, + "flos": 858960516096.0, + "grad_norm": 0.09542043769716892, + "language_loss": 0.79432011, + "learning_rate": 0.00026645094524544225, + "loss": 0.80502266, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.1541748, + "step": 3457, + "time_per_iteration": 3.309166193008423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076037, + "balance_loss_mlp": 1.05976462, + "epoch": 0.6652558676414005, + "flos": 604312939008.0, + "grad_norm": 0.07016306798114455, + "language_loss": 0.75077713, + "learning_rate": 0.00026617552317699945, + "loss": 0.76153749, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.16271973, + "step": 3458, + "time_per_iteration": 2.7976672649383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070804, + "balance_loss_mlp": 1.05473471, + "epoch": 0.6654482493266641, + "flos": 510394576896.0, + "grad_norm": 0.0843552623748082, + "language_loss": 0.87185872, + "learning_rate": 0.0002659001918872693, + "loss": 0.88256675, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.16052246, + "step": 3459, + "time_per_iteration": 2.619687080383301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077284, + "balance_loss_mlp": 1.06169105, + "epoch": 0.6656406310119277, + "flos": 565605130752.0, + "grad_norm": 0.07816001351873277, + "language_loss": 0.81045449, + "learning_rate": 0.0002656249514831449, + "loss": 0.82122731, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.15576172, + "step": 3460, + "time_per_iteration": 2.658799409866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081416, + "balance_loss_mlp": 1.06559706, + "epoch": 0.6658330126971912, + 
"flos": 1024298141184.0, + "grad_norm": 0.06934435326228823, + "language_loss": 0.87209398, + "learning_rate": 0.00026534980207148416, + "loss": 0.88290811, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.15808105, + "step": 3461, + "time_per_iteration": 3.4665892124176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080438, + "balance_loss_mlp": 1.06466627, + "epoch": 0.6660253943824548, + "flos": 816823388160.0, + "grad_norm": 0.08829817296611697, + "language_loss": 0.73540372, + "learning_rate": 0.0002650747437591097, + "loss": 0.74620807, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.15759277, + "step": 3462, + "time_per_iteration": 3.0266518592834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011257, + "balance_loss_mlp": 1.00381792, + "epoch": 0.6662177760677184, + "flos": 1496169169920.0, + "grad_norm": 0.013095033581449731, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82890832, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.07421875, + "step": 3463, + "time_per_iteration": 5.052983045578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079655, + "balance_loss_mlp": 1.06390786, + "epoch": 0.666410157752982, + "flos": 500120925696.0, + "grad_norm": 0.06636905454287767, + "language_loss": 0.8624109, + "learning_rate": 0.00026452490085933155, + "loss": 0.87320745, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.15734863, + "step": 3464, + "time_per_iteration": 2.6581099033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108047, + "balance_loss_mlp": 1.06460285, + "epoch": 0.6666025394382454, + "flos": 481169714688.0, + "grad_norm": 0.08202270474579819, + "language_loss": 0.89890295, + "learning_rate": 0.00026425011648539614, + "loss": 0.90970761, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.15856934, + "step": 3465, + "time_per_iteration": 
2.5627684593200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087333, + "balance_loss_mlp": 1.07175291, + "epoch": 0.666794921123509, + "flos": 546653919744.0, + "grad_norm": 0.06505185619170838, + "language_loss": 0.82792681, + "learning_rate": 0.00026397542363768267, + "loss": 0.83880019, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.15563965, + "step": 3466, + "time_per_iteration": 2.6587183475494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088421, + "balance_loss_mlp": 1.07291174, + "epoch": 0.6669873028087726, + "flos": 471988145664.0, + "grad_norm": 0.07283561194879179, + "language_loss": 0.8210032, + "learning_rate": 0.0002637008224228362, + "loss": 0.83188736, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.1550293, + "step": 3467, + "time_per_iteration": 2.5522584915161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104442, + "balance_loss_mlp": 1.08923101, + "epoch": 0.6671796844940362, + "flos": 547395065856.0, + "grad_norm": 0.11764859444366832, + "language_loss": 0.84226644, + "learning_rate": 0.00026342631294746653, + "loss": 0.85331088, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.15185547, + "step": 3468, + "time_per_iteration": 2.743476629257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097687, + "balance_loss_mlp": 1.08171296, + "epoch": 0.6673720661792998, + "flos": 1070317214208.0, + "grad_norm": 0.06605655579182129, + "language_loss": 0.80559593, + "learning_rate": 0.0002631518953181476, + "loss": 0.81657279, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.15966797, + "step": 3469, + "time_per_iteration": 3.4870567321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020288, + "balance_loss_mlp": 1.01294446, + "epoch": 0.6675644478645633, + "flos": 1523790600192.0, + "grad_norm": 0.018081388353692853, + "language_loss": 0.76325285, + "learning_rate": 
0.000262877569641418, + "loss": 0.77345574, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.07324219, + "step": 3470, + "time_per_iteration": 4.947405576705933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111899, + "balance_loss_mlp": 1.09661639, + "epoch": 0.6677568295498268, + "flos": 579696113664.0, + "grad_norm": 0.07102946995749101, + "language_loss": 0.80474597, + "learning_rate": 0.00026260333602377985, + "loss": 0.81586492, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.15258789, + "step": 3471, + "time_per_iteration": 2.766829490661621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111832, + "balance_loss_mlp": 1.10340738, + "epoch": 0.6679492112350904, + "flos": 383935458816.0, + "grad_norm": 0.07718732975818399, + "language_loss": 0.87198675, + "learning_rate": 0.0002623291945717007, + "loss": 0.88316995, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.14892578, + "step": 3472, + "time_per_iteration": 2.442732095718384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119016, + "balance_loss_mlp": 1.10393596, + "epoch": 0.668141592920354, + "flos": 1150759830528.0, + "grad_norm": 0.08543337632227696, + "language_loss": 0.83852649, + "learning_rate": 0.00026205514539161175, + "loss": 0.84971666, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.15063477, + "step": 3473, + "time_per_iteration": 3.555755376815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110415, + "balance_loss_mlp": 1.09519196, + "epoch": 0.6683339746056175, + "flos": 561100608000.0, + "grad_norm": 0.08303995839416844, + "language_loss": 0.84348154, + "learning_rate": 0.00026178118858990773, + "loss": 0.85458577, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.15209961, + "step": 3474, + "time_per_iteration": 2.871601104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101729, + "balance_loss_mlp": 1.0864346, + 
"epoch": 0.6685263562908811, + "flos": 514305884160.0, + "grad_norm": 0.07741994536037751, + "language_loss": 0.84034884, + "learning_rate": 0.0002615073242729483, + "loss": 0.8513661, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.15283203, + "step": 3475, + "time_per_iteration": 2.643118381500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105115, + "balance_loss_mlp": 1.08948672, + "epoch": 0.6687187379761447, + "flos": 629772226560.0, + "grad_norm": 0.06829257920767222, + "language_loss": 0.84361756, + "learning_rate": 0.0002612335525470573, + "loss": 0.85466868, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.15612793, + "step": 3476, + "time_per_iteration": 2.819558620452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095918, + "balance_loss_mlp": 1.0802896, + "epoch": 0.6689111196614083, + "flos": 535586992128.0, + "grad_norm": 0.0704314528476978, + "language_loss": 0.77963984, + "learning_rate": 0.0002609598735185221, + "loss": 0.79059905, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.15612793, + "step": 3477, + "time_per_iteration": 2.690528631210327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_mlp": 1.07150483, + "epoch": 0.6691035013466718, + "flos": 603038048256.0, + "grad_norm": 0.061090592598961185, + "language_loss": 0.83056384, + "learning_rate": 0.00026068628729359445, + "loss": 0.8414346, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.15563965, + "step": 3478, + "time_per_iteration": 2.78446364402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085998, + "balance_loss_mlp": 1.07035732, + "epoch": 0.6692958830319353, + "flos": 632855752704.0, + "grad_norm": 0.06337221055834462, + "language_loss": 0.76063514, + "learning_rate": 0.00026041279397848996, + "loss": 0.7714951, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.15625, + "step": 3479, + 
"time_per_iteration": 2.868635654449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081234, + "balance_loss_mlp": 1.06561804, + "epoch": 0.6694882647171989, + "flos": 645471783936.0, + "grad_norm": 0.06666053419548093, + "language_loss": 0.82793164, + "learning_rate": 0.00026013939367938797, + "loss": 0.83874393, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.15600586, + "step": 3480, + "time_per_iteration": 2.908998489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077089, + "balance_loss_mlp": 1.0618062, + "epoch": 0.6696806464024625, + "flos": 569585447424.0, + "grad_norm": 0.068968619676177, + "language_loss": 0.80981463, + "learning_rate": 0.00025986608650243204, + "loss": 0.82058555, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.15258789, + "step": 3481, + "time_per_iteration": 2.834899663925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073739, + "balance_loss_mlp": 1.05738401, + "epoch": 0.6698730280877261, + "flos": 622700669952.0, + "grad_norm": 0.06754459840713438, + "language_loss": 0.79205263, + "learning_rate": 0.0002595928725537293, + "loss": 0.80279005, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.16357422, + "step": 3482, + "time_per_iteration": 2.8607449531555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075932, + "balance_loss_mlp": 1.0601126, + "epoch": 0.6700654097729896, + "flos": 502507722240.0, + "grad_norm": 0.07337554389160682, + "language_loss": 0.87835222, + "learning_rate": 0.0002593197519393509, + "loss": 0.88911158, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.15808105, + "step": 3483, + "time_per_iteration": 2.600332021713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074206, + "balance_loss_mlp": 1.05839944, + "epoch": 0.6702577914582531, + "flos": 623876815872.0, + "grad_norm": 0.05836331293151798, + "language_loss": 0.79302973, + 
"learning_rate": 0.00025904672476533165, + "loss": 0.80377179, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.15795898, + "step": 3484, + "time_per_iteration": 2.883197546005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073722, + "balance_loss_mlp": 1.05790257, + "epoch": 0.6704501731435167, + "flos": 456268764672.0, + "grad_norm": 0.07153016836752667, + "language_loss": 0.82456666, + "learning_rate": 0.0002587737911376704, + "loss": 0.8353039, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.15808105, + "step": 3485, + "time_per_iteration": 2.6315789222717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066049, + "balance_loss_mlp": 1.04982471, + "epoch": 0.6706425548287803, + "flos": 543229369344.0, + "grad_norm": 0.07097685042918324, + "language_loss": 0.84046322, + "learning_rate": 0.00025850095116232885, + "loss": 0.85112369, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.16223145, + "step": 3486, + "time_per_iteration": 2.717060089111328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067626, + "balance_loss_mlp": 1.05155683, + "epoch": 0.6708349365140439, + "flos": 633940494336.0, + "grad_norm": 0.07403350105376266, + "language_loss": 0.77802885, + "learning_rate": 0.000258228204945233, + "loss": 0.78870511, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.16052246, + "step": 3487, + "time_per_iteration": 2.933227777481079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071197, + "balance_loss_mlp": 1.05525851, + "epoch": 0.6710273181993074, + "flos": 640747749888.0, + "grad_norm": 0.07338274313839728, + "language_loss": 0.8460936, + "learning_rate": 0.00025795555259227254, + "loss": 0.85680556, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.15930176, + "step": 3488, + "time_per_iteration": 2.817223072052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067525, + 
"balance_loss_mlp": 1.05141997, + "epoch": 0.671219699884571, + "flos": 553942789632.0, + "grad_norm": 0.07175152498694279, + "language_loss": 0.83673614, + "learning_rate": 0.00025768299420930046, + "loss": 0.84741139, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.16101074, + "step": 3489, + "time_per_iteration": 2.7990496158599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070538, + "balance_loss_mlp": 1.05465984, + "epoch": 0.6714120815698346, + "flos": 731508433920.0, + "grad_norm": 0.1191691604479504, + "language_loss": 0.83219582, + "learning_rate": 0.0002574105299021332, + "loss": 0.84290123, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.15869141, + "step": 3490, + "time_per_iteration": 2.943882703781128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070424, + "balance_loss_mlp": 1.05427098, + "epoch": 0.6716044632550981, + "flos": 688664291328.0, + "grad_norm": 0.07146897272940887, + "language_loss": 0.84251606, + "learning_rate": 0.00025713815977655084, + "loss": 0.85322034, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.16149902, + "step": 3491, + "time_per_iteration": 2.896653890609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072686, + "balance_loss_mlp": 1.05612803, + "epoch": 0.6717968449403616, + "flos": 460629752832.0, + "grad_norm": 0.08069380476860724, + "language_loss": 0.84602511, + "learning_rate": 0.0002568658839382969, + "loss": 0.85675204, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.16552734, + "step": 3492, + "time_per_iteration": 2.576414108276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071874, + "balance_loss_mlp": 1.05595946, + "epoch": 0.6719892266256252, + "flos": 501608360448.0, + "grad_norm": 0.08814414328325225, + "language_loss": 0.84382427, + "learning_rate": 0.00025659370249307814, + "loss": 0.85454303, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 
0.15905762, + "step": 3493, + "time_per_iteration": 2.6682934761047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107054, + "balance_loss_mlp": 1.05422044, + "epoch": 0.6721816083108888, + "flos": 683525081088.0, + "grad_norm": 0.0794608767189563, + "language_loss": 0.852211, + "learning_rate": 0.00025632161554656473, + "loss": 0.86291635, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.16320801, + "step": 3494, + "time_per_iteration": 2.8697800636291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074018, + "balance_loss_mlp": 1.05849671, + "epoch": 0.6723739899961524, + "flos": 585813980160.0, + "grad_norm": 0.07486756079223739, + "language_loss": 0.82214803, + "learning_rate": 0.00025604962320439017, + "loss": 0.83288819, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.1550293, + "step": 3495, + "time_per_iteration": 2.6910951137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071203, + "balance_loss_mlp": 1.0550859, + "epoch": 0.672566371681416, + "flos": 506616519168.0, + "grad_norm": 0.07275570378871335, + "language_loss": 0.82281178, + "learning_rate": 0.0002557777255721516, + "loss": 0.83352381, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.16113281, + "step": 3496, + "time_per_iteration": 2.7872824668884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106895, + "balance_loss_mlp": 1.05311894, + "epoch": 0.6727587533666795, + "flos": 535671055872.0, + "grad_norm": 0.08498246937997968, + "language_loss": 0.80356646, + "learning_rate": 0.0002555059227554087, + "loss": 0.81425595, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.15820312, + "step": 3497, + "time_per_iteration": 2.740039110183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074344, + "balance_loss_mlp": 1.05823898, + "epoch": 0.672951135051943, + "flos": 602832844800.0, + "grad_norm": 0.07033333271672201, + "language_loss": 
0.78129387, + "learning_rate": 0.00025523421485968453, + "loss": 0.79203725, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.16088867, + "step": 3498, + "time_per_iteration": 2.835993528366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010723, + "balance_loss_mlp": 1.05608737, + "epoch": 0.6731435167372066, + "flos": 811315989504.0, + "grad_norm": 0.0769826573624616, + "language_loss": 0.85233575, + "learning_rate": 0.00025496260199046585, + "loss": 0.86305881, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.16210938, + "step": 3499, + "time_per_iteration": 2.9663631916046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072722, + "balance_loss_mlp": 1.05689144, + "epoch": 0.6733358984224702, + "flos": 611594468352.0, + "grad_norm": 0.06665243506821861, + "language_loss": 0.84537292, + "learning_rate": 0.000254691084253202, + "loss": 0.85610014, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.15820312, + "step": 3500, + "time_per_iteration": 2.8711283206939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069122, + "balance_loss_mlp": 1.05341005, + "epoch": 0.6735282801077337, + "flos": 558901762560.0, + "grad_norm": 0.07241548369558537, + "language_loss": 0.7720896, + "learning_rate": 0.00025441966175330567, + "loss": 0.78278077, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.15698242, + "step": 3501, + "time_per_iteration": 2.6806578636169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075273, + "balance_loss_mlp": 1.0595727, + "epoch": 0.6737206617929973, + "flos": 672433560576.0, + "grad_norm": 0.06443940895189408, + "language_loss": 0.79419506, + "learning_rate": 0.00025414833459615183, + "loss": 0.80494785, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.15686035, + "step": 3502, + "time_per_iteration": 2.8561007976531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073331, + 
"balance_loss_mlp": 1.05758393, + "epoch": 0.6739130434782609, + "flos": 633446396928.0, + "grad_norm": 0.08973671276033753, + "language_loss": 0.80348676, + "learning_rate": 0.0002538771028870796, + "loss": 0.81422007, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.15734863, + "step": 3503, + "time_per_iteration": 2.8288323879241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070997, + "balance_loss_mlp": 1.05505931, + "epoch": 0.6741054251635245, + "flos": 531445888512.0, + "grad_norm": 0.0592562662752063, + "language_loss": 0.81655902, + "learning_rate": 0.0002536059667313903, + "loss": 0.82726896, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.15930176, + "step": 3504, + "time_per_iteration": 2.7216036319732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065153, + "balance_loss_mlp": 1.04882145, + "epoch": 0.674297806848788, + "flos": 542604220416.0, + "grad_norm": 0.06237409857608865, + "language_loss": 0.89354527, + "learning_rate": 0.0002533349262343483, + "loss": 0.90419674, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.16333008, + "step": 3505, + "time_per_iteration": 2.6963226795196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068691, + "balance_loss_mlp": 1.05269337, + "epoch": 0.6744901885340515, + "flos": 463523129856.0, + "grad_norm": 0.07295709640267785, + "language_loss": 0.82094926, + "learning_rate": 0.0002530639815011807, + "loss": 0.83163619, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.15991211, + "step": 3506, + "time_per_iteration": 2.5332834720611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064091, + "balance_loss_mlp": 1.04796219, + "epoch": 0.6746825702193151, + "flos": 631830481920.0, + "grad_norm": 0.07658306531614024, + "language_loss": 0.8492251, + "learning_rate": 0.0002527931326370781, + "loss": 0.85986602, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 
0.16125488, + "step": 3507, + "time_per_iteration": 2.803288459777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106573, + "balance_loss_mlp": 1.04962516, + "epoch": 0.6748749519045787, + "flos": 671146186752.0, + "grad_norm": 0.1018008766550388, + "language_loss": 0.83113879, + "learning_rate": 0.00025252237974719276, + "loss": 0.84179616, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.16101074, + "step": 3508, + "time_per_iteration": 2.8635196685791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065757, + "balance_loss_mlp": 1.04993796, + "epoch": 0.6750673335898423, + "flos": 767102980608.0, + "grad_norm": 0.07402200210263096, + "language_loss": 0.80262941, + "learning_rate": 0.00025225172293664056, + "loss": 0.81328702, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.15808105, + "step": 3509, + "time_per_iteration": 3.0501134395599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021392, + "balance_loss_mlp": 1.01371539, + "epoch": 0.6752597152751059, + "flos": 1512607675392.0, + "grad_norm": 0.015224351470046544, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77954531, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.07666016, + "step": 3510, + "time_per_iteration": 4.970271825790405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067885, + "balance_loss_mlp": 1.05207801, + "epoch": 0.6754520969603693, + "flos": 687297996288.0, + "grad_norm": 0.07849919507555615, + "language_loss": 0.84722501, + "learning_rate": 0.00025171069797381106, + "loss": 0.8579039, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.15795898, + "step": 3511, + "time_per_iteration": 2.851109027862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061123, + "balance_loss_mlp": 1.04545879, + "epoch": 0.6756444786456329, + "flos": 500577947136.0, + "grad_norm": 0.07844004767829481, + 
"language_loss": 0.81844723, + "learning_rate": 0.00025144033003157864, + "loss": 0.82905853, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.15649414, + "step": 3512, + "time_per_iteration": 2.674426794052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065093, + "balance_loss_mlp": 1.04981041, + "epoch": 0.6758368603308965, + "flos": 492616940544.0, + "grad_norm": 0.07444066278959373, + "language_loss": 0.78994167, + "learning_rate": 0.00025117005858876806, + "loss": 0.80059266, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.15270996, + "step": 3513, + "time_per_iteration": 2.7095823287963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065974, + "balance_loss_mlp": 1.05047631, + "epoch": 0.6760292420161601, + "flos": 555934233600.0, + "grad_norm": 0.07261266754873474, + "language_loss": 0.85087454, + "learning_rate": 0.000250899883750308, + "loss": 0.86153424, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.15490723, + "step": 3514, + "time_per_iteration": 2.7069034576416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069973, + "balance_loss_mlp": 1.05441582, + "epoch": 0.6762216237014236, + "flos": 607601668608.0, + "grad_norm": 0.07481063892368622, + "language_loss": 0.81707394, + "learning_rate": 0.00025062980562109006, + "loss": 0.82777369, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.15539551, + "step": 3515, + "time_per_iteration": 2.7273197174072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069966, + "balance_loss_mlp": 1.05483794, + "epoch": 0.6764140053866872, + "flos": 533785697280.0, + "grad_norm": 0.08230462896516925, + "language_loss": 0.82973194, + "learning_rate": 0.0002503598243059677, + "loss": 0.84043157, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.15100098, + "step": 3516, + "time_per_iteration": 2.7930819988250732 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01069319, + "balance_loss_mlp": 1.05384541, + "epoch": 0.6766063870719508, + "flos": 504810455040.0, + "grad_norm": 0.07282849285049217, + "language_loss": 0.79984844, + "learning_rate": 0.0002500899399097568, + "loss": 0.81054163, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.15466309, + "step": 3517, + "time_per_iteration": 2.685296058654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072996, + "balance_loss_mlp": 1.05737984, + "epoch": 0.6767987687572143, + "flos": 513176726016.0, + "grad_norm": 0.08174636424990783, + "language_loss": 0.85385978, + "learning_rate": 0.0002498201525372359, + "loss": 0.86458969, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.15600586, + "step": 3518, + "time_per_iteration": 2.606057643890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076186, + "balance_loss_mlp": 1.06118929, + "epoch": 0.6769911504424779, + "flos": 525039128064.0, + "grad_norm": 0.0782780776412412, + "language_loss": 0.83019435, + "learning_rate": 0.00024955046229314584, + "loss": 0.84095621, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.14978027, + "step": 3519, + "time_per_iteration": 2.6214327812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074949, + "balance_loss_mlp": 1.05932105, + "epoch": 0.6771835321277414, + "flos": 449896508928.0, + "grad_norm": 0.06363729030714552, + "language_loss": 0.87326723, + "learning_rate": 0.00024928086928218947, + "loss": 0.88401669, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.15612793, + "step": 3520, + "time_per_iteration": 2.5505292415618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082672, + "balance_loss_mlp": 1.06697249, + "epoch": 0.677375913813005, + "flos": 709349985792.0, + "grad_norm": 0.081240608795973, + "language_loss": 0.75931144, + "learning_rate": 0.00024901137360903216, + "loss": 0.77013814, + "num_input_tokens_seen": 
291998752, + "router_z_loss_mlp": 0.15686035, + "step": 3521, + "time_per_iteration": 2.950173854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108816, + "balance_loss_mlp": 1.07259083, + "epoch": 0.6775682954982686, + "flos": 428420109312.0, + "grad_norm": 0.0718633892564106, + "language_loss": 0.80979002, + "learning_rate": 0.00024874197537830115, + "loss": 0.82067156, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.15551758, + "step": 3522, + "time_per_iteration": 2.5892205238342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090688, + "balance_loss_mlp": 1.07558465, + "epoch": 0.6777606771835322, + "flos": 437905626624.0, + "grad_norm": 0.07642815095579086, + "language_loss": 0.83230734, + "learning_rate": 0.00024847267469458684, + "loss": 0.84321427, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.15087891, + "step": 3523, + "time_per_iteration": 2.5798654556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092549, + "balance_loss_mlp": 1.07682538, + "epoch": 0.6779530588687956, + "flos": 775442087424.0, + "grad_norm": 0.08524244815477096, + "language_loss": 0.77646792, + "learning_rate": 0.00024820347166244034, + "loss": 0.78739345, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.15710449, + "step": 3524, + "time_per_iteration": 3.0167675018310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096891, + "balance_loss_mlp": 1.0811317, + "epoch": 0.6781454405540592, + "flos": 571782094848.0, + "grad_norm": 0.06181044738082458, + "language_loss": 0.84706652, + "learning_rate": 0.0002479343663863755, + "loss": 0.85803545, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.1574707, + "step": 3525, + "time_per_iteration": 2.783334255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093963, + "balance_loss_mlp": 1.07813191, + "epoch": 0.6783378222393228, + "flos": 485026693632.0, + "grad_norm": 
0.07100418030431462, + "language_loss": 0.76605028, + "learning_rate": 0.00024766535897086876, + "loss": 0.77698994, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.15820312, + "step": 3526, + "time_per_iteration": 2.5780889987945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090687, + "balance_loss_mlp": 1.07472503, + "epoch": 0.6785302039245864, + "flos": 482839958016.0, + "grad_norm": 0.07633985319518004, + "language_loss": 0.79213607, + "learning_rate": 0.0002473964495203578, + "loss": 0.80304295, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.15966797, + "step": 3527, + "time_per_iteration": 2.6945083141326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094522, + "balance_loss_mlp": 1.07897758, + "epoch": 0.67872258560985, + "flos": 524732608512.0, + "grad_norm": 0.0748712356502137, + "language_loss": 0.85111511, + "learning_rate": 0.0002471276381392425, + "loss": 0.86206043, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.15527344, + "step": 3528, + "time_per_iteration": 2.815568208694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021348, + "balance_loss_mlp": 1.01443386, + "epoch": 0.6789149672951135, + "flos": 1552605428736.0, + "grad_norm": 0.02081860752447363, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79209983, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.06933594, + "step": 3529, + "time_per_iteration": 5.020832061767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090569, + "balance_loss_mlp": 1.07503641, + "epoch": 0.6791073489803771, + "flos": 741406556160.0, + "grad_norm": 0.06927800077395886, + "language_loss": 0.83888638, + "learning_rate": 0.00024659031000260826, + "loss": 0.84979212, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.15515137, + "step": 3530, + "time_per_iteration": 2.9285757541656494 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01085207, + "balance_loss_mlp": 1.06901824, + "epoch": 0.6792997306656406, + "flos": 576365538816.0, + "grad_norm": 0.07665905507677669, + "language_loss": 0.80867362, + "learning_rate": 0.0002463217934556985, + "loss": 0.81952572, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.16186523, + "step": 3531, + "time_per_iteration": 2.6685454845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013718, + "balance_loss_mlp": 1.00685167, + "epoch": 0.6794921123509042, + "flos": 1503337273344.0, + "grad_norm": 0.012757439752192143, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.7754581, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.06884766, + "step": 3532, + "time_per_iteration": 4.770366191864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108006, + "balance_loss_mlp": 1.06464624, + "epoch": 0.6796844940361677, + "flos": 698923261440.0, + "grad_norm": 0.08290950297443149, + "language_loss": 0.83307445, + "learning_rate": 0.0002457850559259306, + "loss": 0.84387505, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.15393066, + "step": 3533, + "time_per_iteration": 2.927530527114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084943, + "balance_loss_mlp": 1.06996989, + "epoch": 0.6798768757214313, + "flos": 552759303168.0, + "grad_norm": 0.07118556269002856, + "language_loss": 0.81413895, + "learning_rate": 0.00024551683515145275, + "loss": 0.82498837, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.1496582, + "step": 3534, + "time_per_iteration": 2.7142248153686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084982, + "balance_loss_mlp": 1.06973481, + "epoch": 0.6800692574066949, + "flos": 522936456192.0, + "grad_norm": 0.08177709716799147, + "language_loss": 0.86470234, + "learning_rate": 0.0002452487131761014, + "loss": 0.87555218, + 
"num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.15222168, + "step": 3535, + "time_per_iteration": 2.7160773277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082392, + "balance_loss_mlp": 1.06722808, + "epoch": 0.6802616390919585, + "flos": 574023158784.0, + "grad_norm": 0.08194963503580213, + "language_loss": 0.79881543, + "learning_rate": 0.00024498069010397093, + "loss": 0.80963933, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.15136719, + "step": 3536, + "time_per_iteration": 2.7041375637054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089238, + "balance_loss_mlp": 1.07374132, + "epoch": 0.6804540207772221, + "flos": 488157207552.0, + "grad_norm": 0.06629203560716768, + "language_loss": 0.85175467, + "learning_rate": 0.00024471276603911697, + "loss": 0.862647, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.15478516, + "step": 3537, + "time_per_iteration": 2.6187474727630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087967, + "balance_loss_mlp": 1.07295895, + "epoch": 0.6806464024624855, + "flos": 578594119680.0, + "grad_norm": 0.06748814664215633, + "language_loss": 0.79046237, + "learning_rate": 0.0002444449410855572, + "loss": 0.80134201, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.14990234, + "step": 3538, + "time_per_iteration": 2.7858455181121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086378, + "balance_loss_mlp": 1.07165527, + "epoch": 0.6808387841477491, + "flos": 553722905088.0, + "grad_norm": 0.061176482277740064, + "language_loss": 0.83929294, + "learning_rate": 0.00024417721534727033, + "loss": 0.85015678, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.14697266, + "step": 3539, + "time_per_iteration": 2.651829957962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084901, + "balance_loss_mlp": 1.07020283, + "epoch": 0.6810311658330127, + "flos": 
426841270272.0, + "grad_norm": 0.11547680699328156, + "language_loss": 0.83058399, + "learning_rate": 0.00024390958892819687, + "loss": 0.84143305, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.14685059, + "step": 3540, + "time_per_iteration": 2.4877548217773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095795, + "balance_loss_mlp": 1.08085859, + "epoch": 0.6812235475182763, + "flos": 572256368640.0, + "grad_norm": 0.08391920158567873, + "language_loss": 0.80917513, + "learning_rate": 0.0002436420619322381, + "loss": 0.82013303, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.14904785, + "step": 3541, + "time_per_iteration": 2.803321361541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098037, + "balance_loss_mlp": 1.08268261, + "epoch": 0.6814159292035398, + "flos": 501917078016.0, + "grad_norm": 0.06869492380097451, + "language_loss": 0.82632923, + "learning_rate": 0.0002433746344632577, + "loss": 0.8373096, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.15332031, + "step": 3542, + "time_per_iteration": 2.6754159927368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084182, + "balance_loss_mlp": 1.06911397, + "epoch": 0.6816083108888034, + "flos": 765531482112.0, + "grad_norm": 0.08922894517327895, + "language_loss": 0.79983473, + "learning_rate": 0.00024310730662508006, + "loss": 0.81067657, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.15039062, + "step": 3543, + "time_per_iteration": 3.0928573608398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088886, + "balance_loss_mlp": 1.07410395, + "epoch": 0.681800692574067, + "flos": 479459824128.0, + "grad_norm": 0.06644129249236999, + "language_loss": 0.87379378, + "learning_rate": 0.0002428400785214911, + "loss": 0.88468266, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.14758301, + "step": 3544, + "time_per_iteration": 2.604871988296509 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085069, + "balance_loss_mlp": 1.07040668, + "epoch": 0.6819930742593305, + "flos": 691604656128.0, + "grad_norm": 0.06899177555305853, + "language_loss": 0.8269285, + "learning_rate": 0.00024257295025623794, + "loss": 0.83777916, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.14648438, + "step": 3545, + "time_per_iteration": 2.871487617492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087393, + "balance_loss_mlp": 1.07256377, + "epoch": 0.6821854559445941, + "flos": 678096603648.0, + "grad_norm": 0.06856452651542601, + "language_loss": 0.80378067, + "learning_rate": 0.00024230592193302892, + "loss": 0.81465465, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.14807129, + "step": 3546, + "time_per_iteration": 2.9170467853546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082599, + "balance_loss_mlp": 1.06824601, + "epoch": 0.6823778376298576, + "flos": 462191339520.0, + "grad_norm": 0.06947354810539072, + "language_loss": 0.84322613, + "learning_rate": 0.00024203899365553372, + "loss": 0.85405213, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.14343262, + "step": 3547, + "time_per_iteration": 2.5574960708618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018037, + "balance_loss_mlp": 1.01088417, + "epoch": 0.6825702193151212, + "flos": 1475298842112.0, + "grad_norm": 0.009467929777895517, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.77752393, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.07128906, + "step": 3548, + "time_per_iteration": 4.566468954086304 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081922, + "balance_loss_mlp": 1.06721163, + "epoch": 0.6827626010003848, + "flos": 723114998784.0, + "grad_norm": 0.07355519456276065, + "language_loss": 0.83346403, + "learning_rate": 0.00024150543765216848, + "loss": 
0.84428328, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.14697266, + "step": 3549, + "time_per_iteration": 2.9388909339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086551, + "balance_loss_mlp": 1.07185233, + "epoch": 0.6829549826856484, + "flos": 558864686592.0, + "grad_norm": 0.27537387522176376, + "language_loss": 0.83331466, + "learning_rate": 0.00024123881013344352, + "loss": 0.84418023, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.14685059, + "step": 3550, + "time_per_iteration": 2.683187484741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090347, + "balance_loss_mlp": 1.07533836, + "epoch": 0.6831473643709118, + "flos": 624934393344.0, + "grad_norm": 0.060306055735835584, + "language_loss": 0.79340541, + "learning_rate": 0.00024097228307472202, + "loss": 0.80430889, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.14990234, + "step": 3551, + "time_per_iteration": 2.844684362411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092152, + "balance_loss_mlp": 1.07727528, + "epoch": 0.6833397460561754, + "flos": 713861849088.0, + "grad_norm": 0.10551739620621807, + "language_loss": 0.82146621, + "learning_rate": 0.00024070585657947846, + "loss": 0.83238769, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.14855957, + "step": 3552, + "time_per_iteration": 2.8860158920288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094236, + "balance_loss_mlp": 1.07928681, + "epoch": 0.683532127741439, + "flos": 464704045056.0, + "grad_norm": 0.060639169561421215, + "language_loss": 0.85149372, + "learning_rate": 0.00024043953075114934, + "loss": 0.86243612, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.14941406, + "step": 3553, + "time_per_iteration": 2.677131175994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092464, + "balance_loss_mlp": 1.07733643, + "epoch": 
0.6837245094267026, + "flos": 582251037696.0, + "grad_norm": 0.09003750211416942, + "language_loss": 0.88845998, + "learning_rate": 0.00024017330569313128, + "loss": 0.89938462, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.15100098, + "step": 3554, + "time_per_iteration": 2.7099804878234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089308, + "balance_loss_mlp": 1.07441902, + "epoch": 0.6839168911119662, + "flos": 794173413888.0, + "grad_norm": 0.06693310878195398, + "language_loss": 0.74663389, + "learning_rate": 0.0002399071815087821, + "loss": 0.75752699, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.14855957, + "step": 3555, + "time_per_iteration": 3.0262036323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093717, + "balance_loss_mlp": 1.07882786, + "epoch": 0.6841092727972297, + "flos": 580009973760.0, + "grad_norm": 0.08039204780862134, + "language_loss": 0.8364749, + "learning_rate": 0.00023964115830142025, + "loss": 0.84741211, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.14880371, + "step": 3556, + "time_per_iteration": 2.7107839584350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085439, + "balance_loss_mlp": 1.07070458, + "epoch": 0.6843016544824932, + "flos": 383742738432.0, + "grad_norm": 0.09666419591078326, + "language_loss": 0.87049448, + "learning_rate": 0.00023937523617432522, + "loss": 0.88134885, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.14709473, + "step": 3557, + "time_per_iteration": 2.468167781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082368, + "balance_loss_mlp": 1.06743097, + "epoch": 0.6844940361677568, + "flos": 1439035476480.0, + "grad_norm": 0.09214096887844989, + "language_loss": 0.86638856, + "learning_rate": 0.00023910941523073705, + "loss": 0.87721217, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.14916992, + "step": 3558, + 
"time_per_iteration": 3.9070351123809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081618, + "balance_loss_mlp": 1.06658614, + "epoch": 0.6846864178530204, + "flos": 520870860288.0, + "grad_norm": 0.07061573150035702, + "language_loss": 0.8673026, + "learning_rate": 0.0002388436955738566, + "loss": 0.87811875, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.15002441, + "step": 3559, + "time_per_iteration": 2.6991312503814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079541, + "balance_loss_mlp": 1.06413877, + "epoch": 0.6848787995382839, + "flos": 717946053120.0, + "grad_norm": 0.09206834643003141, + "language_loss": 0.81440175, + "learning_rate": 0.00023857807730684523, + "loss": 0.8251971, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.15380859, + "step": 3560, + "time_per_iteration": 2.8983073234558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081366, + "balance_loss_mlp": 1.06604719, + "epoch": 0.6850711812235475, + "flos": 511061571072.0, + "grad_norm": 0.09081340298912512, + "language_loss": 0.8223151, + "learning_rate": 0.00023831256053282547, + "loss": 0.83312881, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.1529541, + "step": 3561, + "time_per_iteration": 2.7063052654266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080271, + "balance_loss_mlp": 1.06454742, + "epoch": 0.6852635629088111, + "flos": 668151493632.0, + "grad_norm": 0.14586084526989435, + "language_loss": 0.78329659, + "learning_rate": 0.00023804714535488003, + "loss": 0.79409927, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.15710449, + "step": 3562, + "time_per_iteration": 2.9026236534118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012311, + "balance_loss_mlp": 1.00525403, + "epoch": 0.6854559445940747, + "flos": 1522980071424.0, + "grad_norm": 0.007277946766615099, + "language_loss": 0.7980963, + 
"learning_rate": 0.0002377818318760519, + "loss": 0.80821943, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.07080078, + "step": 3563, + "time_per_iteration": 4.994298696517944 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083331, + "balance_loss_mlp": 1.06790555, + "epoch": 0.6856483262793382, + "flos": 454203168768.0, + "grad_norm": 0.0750757790304577, + "language_loss": 0.80663192, + "learning_rate": 0.00023751662019934488, + "loss": 0.81746531, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.1541748, + "step": 3564, + "time_per_iteration": 2.6247575283050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087229, + "balance_loss_mlp": 1.07223213, + "epoch": 0.6858407079646017, + "flos": 615552763392.0, + "grad_norm": 0.06101613558394618, + "language_loss": 0.79368252, + "learning_rate": 0.00023725151042772364, + "loss": 0.80455482, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.1496582, + "step": 3565, + "time_per_iteration": 2.733064651489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088358, + "balance_loss_mlp": 1.07328987, + "epoch": 0.6860330896498653, + "flos": 466053087744.0, + "grad_norm": 0.06639135766618469, + "language_loss": 0.83069307, + "learning_rate": 0.00023698650266411276, + "loss": 0.8415767, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.1505127, + "step": 3566, + "time_per_iteration": 2.6350362300872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091248, + "balance_loss_mlp": 1.07638311, + "epoch": 0.6862254713351289, + "flos": 864270425088.0, + "grad_norm": 0.08738258373054857, + "language_loss": 0.83224273, + "learning_rate": 0.00023672159701139755, + "loss": 0.84315515, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.14831543, + "step": 3567, + "time_per_iteration": 3.2377495765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093919, + 
"balance_loss_mlp": 1.07899451, + "epoch": 0.6864178530203925, + "flos": 447141523968.0, + "grad_norm": 0.08115353052200597, + "language_loss": 0.86123919, + "learning_rate": 0.00023645679357242296, + "loss": 0.87217844, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.14904785, + "step": 3568, + "time_per_iteration": 2.5912718772888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085209, + "balance_loss_mlp": 1.07000983, + "epoch": 0.6866102347056561, + "flos": 424269093888.0, + "grad_norm": 0.0774589822263595, + "language_loss": 0.84057611, + "learning_rate": 0.00023619209244999534, + "loss": 0.85142827, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.1517334, + "step": 3569, + "time_per_iteration": 2.5609703063964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088355, + "balance_loss_mlp": 1.07344151, + "epoch": 0.6868026163909196, + "flos": 472373586432.0, + "grad_norm": 0.09034321435408287, + "language_loss": 0.84892517, + "learning_rate": 0.0002359274937468806, + "loss": 0.85980874, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.14904785, + "step": 3570, + "time_per_iteration": 2.5558407306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088136, + "balance_loss_mlp": 1.07292545, + "epoch": 0.6869949980761831, + "flos": 464190124032.0, + "grad_norm": 0.06600150491897518, + "language_loss": 0.78017968, + "learning_rate": 0.00023566299756580512, + "loss": 0.79106104, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.15185547, + "step": 3571, + "time_per_iteration": 2.6505472660064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094782, + "balance_loss_mlp": 1.07961917, + "epoch": 0.6871873797614467, + "flos": 426235944960.0, + "grad_norm": 0.08793371373837118, + "language_loss": 0.78414327, + "learning_rate": 0.0002353986040094551, + "loss": 0.79509115, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 
0.15136719, + "step": 3572, + "time_per_iteration": 2.510256290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093503, + "balance_loss_mlp": 1.07853007, + "epoch": 0.6873797614467103, + "flos": 443625569280.0, + "grad_norm": 0.08501423170750884, + "language_loss": 0.79296732, + "learning_rate": 0.00023513431318047796, + "loss": 0.80390239, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.14953613, + "step": 3573, + "time_per_iteration": 2.5400164127349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086278, + "balance_loss_mlp": 1.07101965, + "epoch": 0.6875721431319738, + "flos": 992323436544.0, + "grad_norm": 0.07288870578041759, + "language_loss": 0.76573622, + "learning_rate": 0.00023487012518147977, + "loss": 0.77659905, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.15234375, + "step": 3574, + "time_per_iteration": 3.248400926589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084828, + "balance_loss_mlp": 1.06955671, + "epoch": 0.6877645248172374, + "flos": 1285513638912.0, + "grad_norm": 0.0698790191488345, + "language_loss": 0.84093738, + "learning_rate": 0.00023460604011502772, + "loss": 0.85178566, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.15258789, + "step": 3575, + "time_per_iteration": 3.6553032398223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085647, + "balance_loss_mlp": 1.07138944, + "epoch": 0.687956906502501, + "flos": 876733383168.0, + "grad_norm": 0.0800354404876214, + "language_loss": 0.85504699, + "learning_rate": 0.00023434205808364845, + "loss": 0.8659035, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.1427002, + "step": 3576, + "time_per_iteration": 3.173497200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094726, + "balance_loss_mlp": 1.07970524, + "epoch": 0.6881492881877646, + "flos": 563324419584.0, + "grad_norm": 0.07355938881939053, + 
"language_loss": 0.8520726, + "learning_rate": 0.00023407817918982932, + "loss": 0.86301988, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.14990234, + "step": 3577, + "time_per_iteration": 2.810378313064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094391, + "balance_loss_mlp": 1.07960927, + "epoch": 0.6883416698730281, + "flos": 795127104000.0, + "grad_norm": 0.06289078804693891, + "language_loss": 0.78850877, + "learning_rate": 0.00023381440353601718, + "loss": 0.79945266, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.14758301, + "step": 3578, + "time_per_iteration": 3.0247299671173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091227, + "balance_loss_mlp": 1.07633758, + "epoch": 0.6885340515582916, + "flos": 723621579264.0, + "grad_norm": 0.07119192926976899, + "language_loss": 0.85820395, + "learning_rate": 0.00023355073122461822, + "loss": 0.86911619, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.14868164, + "step": 3579, + "time_per_iteration": 2.90800404548645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094317, + "balance_loss_mlp": 1.07949877, + "epoch": 0.6887264332435552, + "flos": 1010926282752.0, + "grad_norm": 0.07022836851030782, + "language_loss": 0.82529831, + "learning_rate": 0.00023328716235799973, + "loss": 0.83624148, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.14782715, + "step": 3580, + "time_per_iteration": 3.300455331802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100506, + "balance_loss_mlp": 1.08599877, + "epoch": 0.6889188149288188, + "flos": 585262983168.0, + "grad_norm": 0.08437878588236032, + "language_loss": 0.8341161, + "learning_rate": 0.00023302369703848803, + "loss": 0.84512115, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.14489746, + "step": 3581, + "time_per_iteration": 2.6898550987243652 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01098143, + "balance_loss_mlp": 1.08326566, + "epoch": 0.6891111966140824, + "flos": 636119889408.0, + "grad_norm": 0.08155365941467911, + "language_loss": 0.80103743, + "learning_rate": 0.00023276033536836937, + "loss": 0.81201887, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.14868164, + "step": 3582, + "time_per_iteration": 2.7915916442871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109, + "balance_loss_mlp": 1.07499123, + "epoch": 0.6893035782993459, + "flos": 495270609408.0, + "grad_norm": 0.0619697140217233, + "language_loss": 0.84551424, + "learning_rate": 0.00023249707744988984, + "loss": 0.8564142, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.14990234, + "step": 3583, + "time_per_iteration": 2.659757375717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092214, + "balance_loss_mlp": 1.07747972, + "epoch": 0.6894959599846094, + "flos": 458215792128.0, + "grad_norm": 0.08143589972695583, + "language_loss": 0.82035959, + "learning_rate": 0.00023223392338525529, + "loss": 0.83128172, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.1472168, + "step": 3584, + "time_per_iteration": 2.5301597118377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094389, + "balance_loss_mlp": 1.07959485, + "epoch": 0.689688341669873, + "flos": 505003175424.0, + "grad_norm": 0.21421019470066024, + "language_loss": 0.78488421, + "learning_rate": 0.00023197087327663107, + "loss": 0.79582822, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.14770508, + "step": 3585, + "time_per_iteration": 2.679481029510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096698, + "balance_loss_mlp": 1.08208311, + "epoch": 0.6898807233551366, + "flos": 763910797824.0, + "grad_norm": 0.06326504558768707, + "language_loss": 0.81044286, + "learning_rate": 0.00023170792722614243, + "loss": 0.82140982, + "num_input_tokens_seen": 
297338352, + "router_z_loss_mlp": 0.14599609, + "step": 3586, + "time_per_iteration": 2.9200220108032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099186, + "balance_loss_mlp": 1.08436847, + "epoch": 0.6900731050404002, + "flos": 583337977344.0, + "grad_norm": 0.05947736637449061, + "language_loss": 0.83560526, + "learning_rate": 0.00023144508533587377, + "loss": 0.84659708, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.14819336, + "step": 3587, + "time_per_iteration": 2.8857367038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098984, + "balance_loss_mlp": 1.08429766, + "epoch": 0.6902654867256637, + "flos": 711865262592.0, + "grad_norm": 0.08877001768581633, + "language_loss": 0.78586876, + "learning_rate": 0.0002311823477078698, + "loss": 0.79685855, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.14660645, + "step": 3588, + "time_per_iteration": 2.9328413009643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107388, + "balance_loss_mlp": 1.09263027, + "epoch": 0.6904578684109273, + "flos": 597112902144.0, + "grad_norm": 0.06868681048998228, + "language_loss": 0.85218358, + "learning_rate": 0.00023091971444413428, + "loss": 0.86325753, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.14733887, + "step": 3589, + "time_per_iteration": 2.8086462020874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104804, + "balance_loss_mlp": 1.09003377, + "epoch": 0.6906502500961909, + "flos": 585040527360.0, + "grad_norm": 0.06776060090181614, + "language_loss": 0.82496858, + "learning_rate": 0.00023065718564663012, + "loss": 0.83601665, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.14733887, + "step": 3590, + "time_per_iteration": 2.7810418605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_mlp": 1.0248282, + "epoch": 0.6908426317814544, + "flos": 1587827017728.0, + 
"grad_norm": 0.01280069102087921, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.7494362, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.07177734, + "step": 3591, + "time_per_iteration": 5.021310806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098984, + "balance_loss_mlp": 1.08440506, + "epoch": 0.6910350134667179, + "flos": 500780579328.0, + "grad_norm": 0.06380479300315355, + "language_loss": 0.81160456, + "learning_rate": 0.0002301324418579666, + "loss": 0.8225944, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.14562988, + "step": 3592, + "time_per_iteration": 2.7419848442077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_mlp": 1.02446389, + "epoch": 0.6912273951519815, + "flos": 1409194257408.0, + "grad_norm": 0.013008866579229384, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79719996, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.07080078, + "step": 3593, + "time_per_iteration": 4.783770322799683 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101571, + "balance_loss_mlp": 1.08720624, + "epoch": 0.6914197768372451, + "flos": 635279625216.0, + "grad_norm": 0.07461713066007468, + "language_loss": 0.80640858, + "learning_rate": 0.00022960811715677415, + "loss": 0.8174243, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.14355469, + "step": 3594, + "time_per_iteration": 2.8687844276428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101393, + "balance_loss_mlp": 1.08688569, + "epoch": 0.6916121585225087, + "flos": 558044246016.0, + "grad_norm": 0.06773480939306162, + "language_loss": 0.81380737, + "learning_rate": 0.00022934611221845608, + "loss": 0.82482135, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.14489746, + "step": 3595, + "time_per_iteration": 2.876573085784912 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098147, + "balance_loss_mlp": 1.08306754, + "epoch": 0.6918045402077723, + "flos": 529167748608.0, + "grad_norm": 0.07714844100639354, + "language_loss": 0.78139538, + "learning_rate": 0.00022908421235729609, + "loss": 0.79237688, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.1505127, + "step": 3596, + "time_per_iteration": 2.758575439453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090977, + "balance_loss_mlp": 1.07603967, + "epoch": 0.6919969218930357, + "flos": 570351559680.0, + "grad_norm": 0.07010095160576196, + "language_loss": 0.85004246, + "learning_rate": 0.0002288224176749728, + "loss": 0.86095226, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.14904785, + "step": 3597, + "time_per_iteration": 2.6715195178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103476, + "balance_loss_mlp": 1.08851576, + "epoch": 0.6921893035782993, + "flos": 683305196544.0, + "grad_norm": 0.08507252289690358, + "language_loss": 0.78096193, + "learning_rate": 0.00022856072827312385, + "loss": 0.79199672, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.14929199, + "step": 3598, + "time_per_iteration": 2.8153061866760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086452, + "balance_loss_mlp": 1.07144332, + "epoch": 0.6923816852635629, + "flos": 546745324032.0, + "grad_norm": 0.09179482408325199, + "language_loss": 0.76836538, + "learning_rate": 0.00022829914425334598, + "loss": 0.77922994, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.14978027, + "step": 3599, + "time_per_iteration": 2.6517763137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090108, + "balance_loss_mlp": 1.07546949, + "epoch": 0.6925740669488265, + "flos": 510036300288.0, + "grad_norm": 0.06988561333174233, + "language_loss": 0.80617976, + "learning_rate": 0.0002280376657171956, + "loss": 
0.8170808, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.14624023, + "step": 3600, + "time_per_iteration": 2.668285369873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090912, + "balance_loss_mlp": 1.075737, + "epoch": 0.69276644863409, + "flos": 869424689664.0, + "grad_norm": 0.0699308267355068, + "language_loss": 0.76665217, + "learning_rate": 0.00022777629276618706, + "loss": 0.77756131, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.1517334, + "step": 3601, + "time_per_iteration": 3.14390230178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108857, + "balance_loss_mlp": 1.07352614, + "epoch": 0.6929588303193536, + "flos": 625772086272.0, + "grad_norm": 0.07480870376538759, + "language_loss": 0.77635819, + "learning_rate": 0.0002275150255017947, + "loss": 0.78724384, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.15039062, + "step": 3602, + "time_per_iteration": 2.8169686794281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012655, + "balance_loss_mlp": 1.00578892, + "epoch": 0.6931512120046172, + "flos": 1545382996992.0, + "grad_norm": 0.008701900485553249, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76745325, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.06884766, + "step": 3603, + "time_per_iteration": 5.027594327926636 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011913, + "balance_loss_mlp": 1.00504613, + "epoch": 0.6933435936898807, + "flos": 1448230606848.0, + "grad_norm": 0.008216841516263335, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76139021, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.06884766, + "step": 3604, + "time_per_iteration": 4.732091426849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082938, + "balance_loss_mlp": 1.0677743, + "epoch": 0.6935359753751443, + 
"flos": 540896901120.0, + "grad_norm": 0.10647094637473072, + "language_loss": 0.84698266, + "learning_rate": 0.0002267318588424379, + "loss": 0.85781205, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.15161133, + "step": 3605, + "time_per_iteration": 2.6778976917266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081609, + "balance_loss_mlp": 1.06657648, + "epoch": 0.6937283570604078, + "flos": 719396411904.0, + "grad_norm": 0.06584839695977855, + "language_loss": 0.87588215, + "learning_rate": 0.00022647101533842845, + "loss": 0.88669825, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.15002441, + "step": 3606, + "time_per_iteration": 2.8986434936523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080072, + "balance_loss_mlp": 1.06539774, + "epoch": 0.6939207387456714, + "flos": 522165574656.0, + "grad_norm": 0.07095695288657847, + "language_loss": 0.76177275, + "learning_rate": 0.00022621027802778872, + "loss": 0.77257347, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.14660645, + "step": 3607, + "time_per_iteration": 2.68804931640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.06613147, + "epoch": 0.694113120430935, + "flos": 535359767040.0, + "grad_norm": 0.08196461827358215, + "language_loss": 0.78305531, + "learning_rate": 0.00022594964701174586, + "loss": 0.79386854, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.1517334, + "step": 3608, + "time_per_iteration": 2.6762053966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086087, + "balance_loss_mlp": 1.07163918, + "epoch": 0.6943055021161986, + "flos": 523358972928.0, + "grad_norm": 0.08367512296737743, + "language_loss": 0.84715855, + "learning_rate": 0.00022568912239148586, + "loss": 0.85801935, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.14416504, + "step": 3609, + "time_per_iteration": 
2.7212107181549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080855, + "balance_loss_mlp": 1.06584692, + "epoch": 0.694497883801462, + "flos": 484902982656.0, + "grad_norm": 0.07445866768245664, + "language_loss": 0.81263393, + "learning_rate": 0.00022542870426815344, + "loss": 0.82344246, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.14990234, + "step": 3610, + "time_per_iteration": 2.733375072479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010749, + "balance_loss_mlp": 1.06001055, + "epoch": 0.6946902654867256, + "flos": 461474786304.0, + "grad_norm": 0.07362557272852362, + "language_loss": 0.86188352, + "learning_rate": 0.00022516839274285173, + "loss": 0.8726325, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.14880371, + "step": 3611, + "time_per_iteration": 2.5730910301208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078598, + "balance_loss_mlp": 1.06367326, + "epoch": 0.6948826471719892, + "flos": 512855525376.0, + "grad_norm": 0.07586635796694485, + "language_loss": 0.75025129, + "learning_rate": 0.00022490818791664265, + "loss": 0.76103735, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.14892578, + "step": 3612, + "time_per_iteration": 2.6340460777282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081262, + "balance_loss_mlp": 1.06613493, + "epoch": 0.6950750288572528, + "flos": 557184531456.0, + "grad_norm": 0.07004728730886566, + "language_loss": 0.855506, + "learning_rate": 0.00022464808989054676, + "loss": 0.8663187, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.15100098, + "step": 3613, + "time_per_iteration": 2.691323757171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079633, + "balance_loss_mlp": 1.06455255, + "epoch": 0.6952674105425164, + "flos": 542475740160.0, + "grad_norm": 0.07439341927968558, + "language_loss": 0.76007962, + "learning_rate": 
0.00022438809876554284, + "loss": 0.77087599, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.15063477, + "step": 3614, + "time_per_iteration": 2.6413824558258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083746, + "balance_loss_mlp": 1.06851149, + "epoch": 0.6954597922277799, + "flos": 546742752768.0, + "grad_norm": 0.07752478508749527, + "language_loss": 0.80230355, + "learning_rate": 0.00022412821464256873, + "loss": 0.81314099, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.15209961, + "step": 3615, + "time_per_iteration": 2.697600841522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085665, + "balance_loss_mlp": 1.07031107, + "epoch": 0.6956521739130435, + "flos": 519511905792.0, + "grad_norm": 0.07699011833004216, + "language_loss": 0.82032132, + "learning_rate": 0.00022386843762252023, + "loss": 0.83117795, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.15332031, + "step": 3616, + "time_per_iteration": 2.6330502033233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089945, + "balance_loss_mlp": 1.0750314, + "epoch": 0.695844555598307, + "flos": 466275543552.0, + "grad_norm": 0.09639318919512468, + "language_loss": 0.79538012, + "learning_rate": 0.00022360876780625193, + "loss": 0.80627954, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.14880371, + "step": 3617, + "time_per_iteration": 2.582629680633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079925, + "balance_loss_mlp": 1.06488085, + "epoch": 0.6960369372835706, + "flos": 600663361536.0, + "grad_norm": 0.056274852551945066, + "language_loss": 0.80103874, + "learning_rate": 0.00022334920529457604, + "loss": 0.81183803, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.15026855, + "step": 3618, + "time_per_iteration": 2.936511754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091126, + "balance_loss_mlp": 
1.07581925, + "epoch": 0.6962293189688342, + "flos": 644233969152.0, + "grad_norm": 0.07050393221618255, + "language_loss": 0.87297118, + "learning_rate": 0.00022308975018826423, + "loss": 0.8838824, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.15283203, + "step": 3619, + "time_per_iteration": 2.8777477741241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086314, + "balance_loss_mlp": 1.07101941, + "epoch": 0.6964217006540977, + "flos": 638810634240.0, + "grad_norm": 0.06699510138661768, + "language_loss": 0.84512174, + "learning_rate": 0.00022283040258804564, + "loss": 0.85598493, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.15270996, + "step": 3620, + "time_per_iteration": 2.7737884521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082599, + "balance_loss_mlp": 1.0671612, + "epoch": 0.6966140823393613, + "flos": 652167811584.0, + "grad_norm": 0.06929377823135867, + "language_loss": 0.83519012, + "learning_rate": 0.00022257116259460802, + "loss": 0.84601611, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.15429688, + "step": 3621, + "time_per_iteration": 2.904534101486206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081028, + "balance_loss_mlp": 1.06585217, + "epoch": 0.6968064640246249, + "flos": 704492328960.0, + "grad_norm": 0.06749965217673044, + "language_loss": 0.81476158, + "learning_rate": 0.00022231203030859725, + "loss": 0.82557189, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.15148926, + "step": 3622, + "time_per_iteration": 2.979004144668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087747, + "balance_loss_mlp": 1.0728699, + "epoch": 0.6969988457098885, + "flos": 492555271680.0, + "grad_norm": 0.10955891307443118, + "language_loss": 0.83551806, + "learning_rate": 0.00022205300583061737, + "loss": 0.84639549, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.14855957, + "step": 
3623, + "time_per_iteration": 2.5939643383026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101565, + "balance_loss_mlp": 1.00821149, + "epoch": 0.6971912273951519, + "flos": 1352592442368.0, + "grad_norm": 0.01064219692859378, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83853853, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.07421875, + "step": 3624, + "time_per_iteration": 4.894463539123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084586, + "balance_loss_mlp": 1.0696255, + "epoch": 0.6973836090804155, + "flos": 602459887104.0, + "grad_norm": 0.08187690915242911, + "language_loss": 0.77176243, + "learning_rate": 0.00022153528070095735, + "loss": 0.78260833, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.14941406, + "step": 3625, + "time_per_iteration": 2.740964651107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082128, + "balance_loss_mlp": 1.06735802, + "epoch": 0.6975759907656791, + "flos": 524065614336.0, + "grad_norm": 0.07883351153063048, + "language_loss": 0.87864482, + "learning_rate": 0.00022127658025027568, + "loss": 0.88946617, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.14758301, + "step": 3626, + "time_per_iteration": 2.694669723510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081274, + "balance_loss_mlp": 1.06603932, + "epoch": 0.6977683724509427, + "flos": 480912754176.0, + "grad_norm": 0.2474524571141355, + "language_loss": 0.84912121, + "learning_rate": 0.00022101798800962258, + "loss": 0.85993397, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.15209961, + "step": 3627, + "time_per_iteration": 2.6004905700683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082967, + "balance_loss_mlp": 1.06798291, + "epoch": 0.6979607541362063, + "flos": 522625167360.0, + "grad_norm": 0.07660061174433377, + "language_loss": 0.7872625, + 
"learning_rate": 0.00022075950407939227, + "loss": 0.79809219, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.14978027, + "step": 3628, + "time_per_iteration": 2.6326966285705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090002, + "balance_loss_mlp": 1.07531548, + "epoch": 0.6981531358214698, + "flos": 548077114368.0, + "grad_norm": 0.0701967106904507, + "language_loss": 0.82905591, + "learning_rate": 0.0002205011285599367, + "loss": 0.83995599, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.14672852, + "step": 3629, + "time_per_iteration": 2.639697551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091118, + "balance_loss_mlp": 1.07614517, + "epoch": 0.6983455175067333, + "flos": 700052419584.0, + "grad_norm": 0.06279315859884016, + "language_loss": 0.80564064, + "learning_rate": 0.00022024286155156658, + "loss": 0.8165518, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.1496582, + "step": 3630, + "time_per_iteration": 2.8495450019836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108414, + "balance_loss_mlp": 1.06932223, + "epoch": 0.6985378991919969, + "flos": 485078450688.0, + "grad_norm": 0.06855398456951894, + "language_loss": 0.85904682, + "learning_rate": 0.00021998470315454994, + "loss": 0.86988831, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.14794922, + "step": 3631, + "time_per_iteration": 2.67564058303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088432, + "balance_loss_mlp": 1.07317352, + "epoch": 0.6987302808772605, + "flos": 558780622848.0, + "grad_norm": 0.0636105841757025, + "language_loss": 0.86414385, + "learning_rate": 0.00021972665346911275, + "loss": 0.87502813, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.15234375, + "step": 3632, + "time_per_iteration": 2.7555418014526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095235, + 
"balance_loss_mlp": 1.0801785, + "epoch": 0.698922662562524, + "flos": 483593587200.0, + "grad_norm": 0.07511038810381725, + "language_loss": 0.79825956, + "learning_rate": 0.00021946871259543877, + "loss": 0.80921185, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.15026855, + "step": 3633, + "time_per_iteration": 2.633380651473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092543, + "balance_loss_mlp": 1.07767808, + "epoch": 0.6991150442477876, + "flos": 718909655040.0, + "grad_norm": 0.09690309084755197, + "language_loss": 0.82919359, + "learning_rate": 0.00021921088063366957, + "loss": 0.840119, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.14831543, + "step": 3634, + "time_per_iteration": 2.9242210388183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092511, + "balance_loss_mlp": 1.0774194, + "epoch": 0.6993074259330512, + "flos": 489128150016.0, + "grad_norm": 0.0639201840368843, + "language_loss": 0.81773442, + "learning_rate": 0.00021895315768390435, + "loss": 0.82865953, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.15063477, + "step": 3635, + "time_per_iteration": 2.6489744186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095535, + "balance_loss_mlp": 1.0803355, + "epoch": 0.6994998076183148, + "flos": 718089214464.0, + "grad_norm": 0.060489852807190235, + "language_loss": 0.87983084, + "learning_rate": 0.00021869554384619999, + "loss": 0.89078617, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.15185547, + "step": 3636, + "time_per_iteration": 3.0024359226226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090766, + "balance_loss_mlp": 1.07566249, + "epoch": 0.6996921893035783, + "flos": 579016636416.0, + "grad_norm": 0.08372148054785959, + "language_loss": 0.80742836, + "learning_rate": 0.00021843803922057115, + "loss": 0.81833601, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 
0.15075684, + "step": 3637, + "time_per_iteration": 2.7597806453704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099553, + "balance_loss_mlp": 1.08446145, + "epoch": 0.6998845709888418, + "flos": 518629796352.0, + "grad_norm": 0.07613673241424718, + "language_loss": 0.81840616, + "learning_rate": 0.00021818064390698977, + "loss": 0.82940167, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.15075684, + "step": 3638, + "time_per_iteration": 2.662210702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097696, + "balance_loss_mlp": 1.08278298, + "epoch": 0.7000769526741054, + "flos": 620951505408.0, + "grad_norm": 0.0762563380177704, + "language_loss": 0.86943358, + "learning_rate": 0.0002179233580053861, + "loss": 0.88041055, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.14892578, + "step": 3639, + "time_per_iteration": 2.76003098487854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088923, + "balance_loss_mlp": 1.0736047, + "epoch": 0.700269334359369, + "flos": 559946856960.0, + "grad_norm": 0.06684483131763276, + "language_loss": 0.85643017, + "learning_rate": 0.00021766618161564688, + "loss": 0.86731935, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.1529541, + "step": 3640, + "time_per_iteration": 2.710590362548828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089129, + "balance_loss_mlp": 1.07377481, + "epoch": 0.7004617160446326, + "flos": 483343967232.0, + "grad_norm": 0.08652937172490481, + "language_loss": 0.87291199, + "learning_rate": 0.00021740911483761677, + "loss": 0.88380325, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.15344238, + "step": 3641, + "time_per_iteration": 2.587820529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108697, + "balance_loss_mlp": 1.0719738, + "epoch": 0.7006540977298961, + "flos": 696981003264.0, + "grad_norm": 0.05890602185122373, + "language_loss": 
0.92162985, + "learning_rate": 0.00021715215777109837, + "loss": 0.93249953, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.14978027, + "step": 3642, + "time_per_iteration": 2.9837920665740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087066, + "balance_loss_mlp": 1.07216477, + "epoch": 0.7008464794151597, + "flos": 504775950336.0, + "grad_norm": 0.0660113097105393, + "language_loss": 0.84073913, + "learning_rate": 0.00021689531051585103, + "loss": 0.85160977, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.14904785, + "step": 3643, + "time_per_iteration": 2.5994443893432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083136, + "balance_loss_mlp": 1.06766284, + "epoch": 0.7010388611004232, + "flos": 537242554368.0, + "grad_norm": 0.08730620791306808, + "language_loss": 0.80473441, + "learning_rate": 0.00021663857317159196, + "loss": 0.81556571, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.15454102, + "step": 3644, + "time_per_iteration": 2.636361837387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089327, + "balance_loss_mlp": 1.07431817, + "epoch": 0.7012312427856868, + "flos": 547259245056.0, + "grad_norm": 0.07432760793631779, + "language_loss": 0.82087952, + "learning_rate": 0.00021638194583799487, + "loss": 0.8317728, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.14978027, + "step": 3645, + "time_per_iteration": 2.697885513305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082052, + "balance_loss_mlp": 1.06686449, + "epoch": 0.7014236244709504, + "flos": 941409630720.0, + "grad_norm": 0.07667470628550804, + "language_loss": 0.82340956, + "learning_rate": 0.00021612542861469176, + "loss": 0.83423007, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.15185547, + "step": 3646, + "time_per_iteration": 3.2449471950531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075293, 
+ "balance_loss_mlp": 1.06002283, + "epoch": 0.7016160061562139, + "flos": 525167608320.0, + "grad_norm": 0.08774418992406267, + "language_loss": 0.82529956, + "learning_rate": 0.00021586902160127135, + "loss": 0.83605254, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.15246582, + "step": 3647, + "time_per_iteration": 2.6226844787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086185, + "balance_loss_mlp": 1.07074714, + "epoch": 0.7018083878414775, + "flos": 373385023488.0, + "grad_norm": 0.12454789428341487, + "language_loss": 0.73784959, + "learning_rate": 0.00021561272489727974, + "loss": 0.74871147, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.15429688, + "step": 3648, + "time_per_iteration": 2.445005178451538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088543, + "balance_loss_mlp": 1.07320118, + "epoch": 0.7020007695267411, + "flos": 527784201216.0, + "grad_norm": 0.07345436564624129, + "language_loss": 0.79976106, + "learning_rate": 0.0002153565386022199, + "loss": 0.81064653, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.15332031, + "step": 3649, + "time_per_iteration": 2.656079053878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107984, + "balance_loss_mlp": 1.06450915, + "epoch": 0.7021931512120047, + "flos": 690154297344.0, + "grad_norm": 0.07891543981767615, + "language_loss": 0.82497263, + "learning_rate": 0.00021510046281555262, + "loss": 0.83577102, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.15307617, + "step": 3650, + "time_per_iteration": 2.8389041423797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108619, + "balance_loss_mlp": 1.0712415, + "epoch": 0.7023855328972681, + "flos": 639784147968.0, + "grad_norm": 0.08322895667729725, + "language_loss": 0.82151127, + "learning_rate": 0.0002148444976366949, + "loss": 0.83237314, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 
0.14929199, + "step": 3651, + "time_per_iteration": 2.7878010272979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088068, + "balance_loss_mlp": 1.07308304, + "epoch": 0.7025779145825317, + "flos": 560940194304.0, + "grad_norm": 0.09064059041024937, + "language_loss": 0.82483077, + "learning_rate": 0.00021458864316502136, + "loss": 0.83571148, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.1496582, + "step": 3652, + "time_per_iteration": 2.7618918418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081972, + "balance_loss_mlp": 1.06670094, + "epoch": 0.7027702962677953, + "flos": 447445472256.0, + "grad_norm": 0.07081207687484876, + "language_loss": 0.87084836, + "learning_rate": 0.0002143328994998634, + "loss": 0.88166809, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.15258789, + "step": 3653, + "time_per_iteration": 2.510607957839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082487, + "balance_loss_mlp": 1.06681085, + "epoch": 0.7029626779530589, + "flos": 622500609024.0, + "grad_norm": 0.07138431513844615, + "language_loss": 0.78192198, + "learning_rate": 0.00021407726674050982, + "loss": 0.7927469, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.15661621, + "step": 3654, + "time_per_iteration": 2.917117118835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087015, + "balance_loss_mlp": 1.07145858, + "epoch": 0.7031550596383225, + "flos": 629591989248.0, + "grad_norm": 0.05934864829913755, + "language_loss": 0.87179619, + "learning_rate": 0.0002138217449862061, + "loss": 0.88266635, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.15539551, + "step": 3655, + "time_per_iteration": 2.756298542022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079642, + "balance_loss_mlp": 1.06409764, + "epoch": 0.703347441323586, + "flos": 530843134464.0, + "grad_norm": 0.05984331693080437, + 
"language_loss": 0.78077435, + "learning_rate": 0.00021356633433615403, + "loss": 0.79157078, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.15527344, + "step": 3656, + "time_per_iteration": 2.5978493690490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107548, + "balance_loss_mlp": 1.06014955, + "epoch": 0.7035398230088495, + "flos": 693593528832.0, + "grad_norm": 0.058151360566504745, + "language_loss": 0.83692706, + "learning_rate": 0.0002133110348895133, + "loss": 0.84768182, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.15307617, + "step": 3657, + "time_per_iteration": 2.959099769592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077112, + "balance_loss_mlp": 1.06172252, + "epoch": 0.7037322046941131, + "flos": 968035152384.0, + "grad_norm": 0.06222478003101834, + "language_loss": 0.84750932, + "learning_rate": 0.0002130558467453999, + "loss": 0.85828042, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.15368652, + "step": 3658, + "time_per_iteration": 3.370732545852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078328, + "balance_loss_mlp": 1.0626049, + "epoch": 0.7039245863793767, + "flos": 502863427584.0, + "grad_norm": 0.06594992598251542, + "language_loss": 0.84501821, + "learning_rate": 0.0002128007700028865, + "loss": 0.85580146, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.15710449, + "step": 3659, + "time_per_iteration": 2.7245702743530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069448, + "balance_loss_mlp": 1.05422533, + "epoch": 0.7041169680646402, + "flos": 465954342912.0, + "grad_norm": 0.08946749020423889, + "language_loss": 0.84478891, + "learning_rate": 0.00021254580476100276, + "loss": 0.85548341, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.15209961, + "step": 3660, + "time_per_iteration": 2.5659265518188477 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01074702, + "balance_loss_mlp": 1.05915749, + "epoch": 0.7043093497499038, + "flos": 632181417984.0, + "grad_norm": 0.06878141726007914, + "language_loss": 0.78906703, + "learning_rate": 0.00021229095111873497, + "loss": 0.79981405, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.15527344, + "step": 3661, + "time_per_iteration": 2.76365327835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064044, + "balance_loss_mlp": 1.04833269, + "epoch": 0.7045017314351674, + "flos": 542930190336.0, + "grad_norm": 0.07470763388511147, + "language_loss": 0.86035001, + "learning_rate": 0.0002120362091750261, + "loss": 0.87099046, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.15698242, + "step": 3662, + "time_per_iteration": 2.770721197128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072624, + "balance_loss_mlp": 1.05705488, + "epoch": 0.704694113120431, + "flos": 428237300736.0, + "grad_norm": 0.09281609686454934, + "language_loss": 0.87091279, + "learning_rate": 0.00021178157902877566, + "loss": 0.881639, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.15551758, + "step": 3663, + "time_per_iteration": 2.5207157135009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066204, + "balance_loss_mlp": 1.05053949, + "epoch": 0.7048864948056945, + "flos": 650544556032.0, + "grad_norm": 0.09122726068806429, + "language_loss": 0.86906433, + "learning_rate": 0.0002115270607788397, + "loss": 0.87972641, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.15661621, + "step": 3664, + "time_per_iteration": 2.7837629318237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066813, + "balance_loss_mlp": 1.05149484, + "epoch": 0.705078876490958, + "flos": 412562336256.0, + "grad_norm": 0.07041201506359947, + "language_loss": 0.85376197, + "learning_rate": 0.00021127265452403133, + "loss": 0.86443013, + "num_input_tokens_seen": 
303856336, + "router_z_loss_mlp": 0.15307617, + "step": 3665, + "time_per_iteration": 2.5382044315338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036955, + "balance_loss_mlp": 1.02923036, + "epoch": 0.7052712581762216, + "flos": 1420040927232.0, + "grad_norm": 0.02225212280598243, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85128582, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.07714844, + "step": 3666, + "time_per_iteration": 4.855606317520142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065613, + "balance_loss_mlp": 1.04975796, + "epoch": 0.7054636398614852, + "flos": 493049369088.0, + "grad_norm": 0.07067932110848238, + "language_loss": 0.82453668, + "learning_rate": 0.00021076417839483065, + "loss": 0.8351928, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.15844727, + "step": 3667, + "time_per_iteration": 2.784029960632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063329, + "balance_loss_mlp": 1.04703355, + "epoch": 0.7056560215467488, + "flos": 450457417728.0, + "grad_norm": 0.06070170414255382, + "language_loss": 0.84920627, + "learning_rate": 0.00021051010871784589, + "loss": 0.85983962, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.16296387, + "step": 3668, + "time_per_iteration": 2.5824825763702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061908, + "balance_loss_mlp": 1.04632711, + "epoch": 0.7058484032320124, + "flos": 565703875584.0, + "grad_norm": 0.06560783943853528, + "language_loss": 0.7931717, + "learning_rate": 0.0002102561514308045, + "loss": 0.80379081, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.15563965, + "step": 3669, + "time_per_iteration": 2.754573345184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064666, + "balance_loss_mlp": 1.04950261, + "epoch": 0.7060407849172758, + "flos": 567008501760.0, + "grad_norm": 
0.0735599697631235, + "language_loss": 0.82317781, + "learning_rate": 0.00021000230663230135, + "loss": 0.83382452, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.15148926, + "step": 3670, + "time_per_iteration": 2.7335312366485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057554, + "balance_loss_mlp": 1.04233122, + "epoch": 0.7062331666025394, + "flos": 468746403840.0, + "grad_norm": 0.08649275144444013, + "language_loss": 0.82978272, + "learning_rate": 0.00020974857442088762, + "loss": 0.84035832, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.15197754, + "step": 3671, + "time_per_iteration": 2.5915567874908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062839, + "balance_loss_mlp": 1.04766369, + "epoch": 0.706425548287803, + "flos": 595316749824.0, + "grad_norm": 0.13981263765851287, + "language_loss": 0.88996911, + "learning_rate": 0.00020949495489507104, + "loss": 0.90059757, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.15148926, + "step": 3672, + "time_per_iteration": 2.679868459701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.05241179, + "epoch": 0.7066179299730666, + "flos": 475815389184.0, + "grad_norm": 0.08084311033907006, + "language_loss": 0.84611428, + "learning_rate": 0.00020924144815331525, + "loss": 0.85678977, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.15124512, + "step": 3673, + "time_per_iteration": 2.5828816890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066376, + "balance_loss_mlp": 1.05085516, + "epoch": 0.7068103116583301, + "flos": 506409117696.0, + "grad_norm": 0.07749570003659609, + "language_loss": 0.83121467, + "learning_rate": 0.00020898805429404044, + "loss": 0.84187841, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.1550293, + "step": 3674, + "time_per_iteration": 2.6209206581115723 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.05180216, + "epoch": 0.7070026933435937, + "flos": 679336989696.0, + "grad_norm": 0.08324875322502746, + "language_loss": 0.78500605, + "learning_rate": 0.0002087347734156228, + "loss": 0.79567671, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.15234375, + "step": 3675, + "time_per_iteration": 2.8822014331817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065667, + "balance_loss_mlp": 1.05075419, + "epoch": 0.7071950750288573, + "flos": 472217942016.0, + "grad_norm": 0.07260496319451265, + "language_loss": 0.79725403, + "learning_rate": 0.00020848160561639452, + "loss": 0.80791068, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.14904785, + "step": 3676, + "time_per_iteration": 2.6594197750091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067782, + "balance_loss_mlp": 1.05267811, + "epoch": 0.7073874567141208, + "flos": 473742452736.0, + "grad_norm": 0.07068166110728066, + "language_loss": 0.86114025, + "learning_rate": 0.0002082285509946445, + "loss": 0.87181818, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.15087891, + "step": 3677, + "time_per_iteration": 2.607058525085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071327, + "balance_loss_mlp": 1.05596066, + "epoch": 0.7075798383993844, + "flos": 545877895680.0, + "grad_norm": 0.07275047066851685, + "language_loss": 0.83149093, + "learning_rate": 0.00020797560964861683, + "loss": 0.84220415, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.15344238, + "step": 3678, + "time_per_iteration": 2.7852048873901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107009, + "balance_loss_mlp": 1.05486727, + "epoch": 0.7077722200846479, + "flos": 662090526720.0, + "grad_norm": 0.18523634613836037, + "language_loss": 0.80623943, + "learning_rate": 0.0002077227816765122, + "loss": 0.81694031, + 
"num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.15197754, + "step": 3679, + "time_per_iteration": 3.0609920024871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020725, + "balance_loss_mlp": 1.01333392, + "epoch": 0.7079646017699115, + "flos": 1529960223744.0, + "grad_norm": 0.017379131221616127, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77468443, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.07373047, + "step": 3680, + "time_per_iteration": 4.8490447998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062109, + "balance_loss_mlp": 1.04688621, + "epoch": 0.7081569834551751, + "flos": 621502502400.0, + "grad_norm": 0.06739909135454819, + "language_loss": 0.78692472, + "learning_rate": 0.00020721746624665383, + "loss": 0.79754579, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.15197754, + "step": 3681, + "time_per_iteration": 2.73974609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106588, + "balance_loss_mlp": 1.05075181, + "epoch": 0.7083493651404387, + "flos": 794630435328.0, + "grad_norm": 0.061911135339539125, + "language_loss": 0.80153, + "learning_rate": 0.00020696497898508114, + "loss": 0.8121888, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.15100098, + "step": 3682, + "time_per_iteration": 3.0617854595184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066388, + "balance_loss_mlp": 1.050915, + "epoch": 0.7085417468257021, + "flos": 813747202560.0, + "grad_norm": 0.0963650064314711, + "language_loss": 0.77652842, + "learning_rate": 0.00020671260548979316, + "loss": 0.78719234, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.15454102, + "step": 3683, + "time_per_iteration": 3.0387070178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070402, + "balance_loss_mlp": 1.05497599, + "epoch": 0.7087341285109657, + "flos": 
700566340608.0, + "grad_norm": 0.07537323093447403, + "language_loss": 0.85192174, + "learning_rate": 0.00020646034585876982, + "loss": 0.86262578, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.1541748, + "step": 3684, + "time_per_iteration": 2.8481369018554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067395, + "balance_loss_mlp": 1.05154002, + "epoch": 0.7089265101962293, + "flos": 596514917376.0, + "grad_norm": 0.07238379528702499, + "language_loss": 0.84238535, + "learning_rate": 0.00020620820018994718, + "loss": 0.85305929, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.15844727, + "step": 3685, + "time_per_iteration": 2.850832462310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070547, + "balance_loss_mlp": 1.05496585, + "epoch": 0.7091188918814929, + "flos": 487106970624.0, + "grad_norm": 0.08929254247711407, + "language_loss": 0.82788908, + "learning_rate": 0.00020595616858121675, + "loss": 0.83859456, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.15563965, + "step": 3686, + "time_per_iteration": 2.6979212760925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069674, + "balance_loss_mlp": 1.05395079, + "epoch": 0.7093112735667565, + "flos": 600117507072.0, + "grad_norm": 0.06683720470711539, + "language_loss": 0.80962205, + "learning_rate": 0.00020570425113042586, + "loss": 0.82031882, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.15710449, + "step": 3687, + "time_per_iteration": 2.781977415084839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073663, + "balance_loss_mlp": 1.05815399, + "epoch": 0.70950365525202, + "flos": 505830956544.0, + "grad_norm": 0.08176203647633842, + "language_loss": 0.85817683, + "learning_rate": 0.0002054524479353776, + "loss": 0.86891353, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.1550293, + "step": 3688, + "time_per_iteration": 2.7264649868011475 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107422, + "balance_loss_mlp": 1.05829346, + "epoch": 0.7096960369372836, + "flos": 732160747008.0, + "grad_norm": 0.07836614397127288, + "language_loss": 0.81732869, + "learning_rate": 0.00020520075909383063, + "loss": 0.82807088, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.15917969, + "step": 3689, + "time_per_iteration": 2.8794634342193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074673, + "balance_loss_mlp": 1.05887747, + "epoch": 0.7098884186225471, + "flos": 972077511168.0, + "grad_norm": 0.06452769831785021, + "language_loss": 0.80728209, + "learning_rate": 0.00020494918470349916, + "loss": 0.81802881, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.15783691, + "step": 3690, + "time_per_iteration": 3.310556173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073435, + "balance_loss_mlp": 1.05783021, + "epoch": 0.7100808003078107, + "flos": 504252117504.0, + "grad_norm": 0.07986210521804603, + "language_loss": 0.85468179, + "learning_rate": 0.00020469772486205297, + "loss": 0.86541611, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.15588379, + "step": 3691, + "time_per_iteration": 2.6372458934783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073806, + "balance_loss_mlp": 1.05890524, + "epoch": 0.7102731819930742, + "flos": 540335992320.0, + "grad_norm": 0.0774052521314589, + "language_loss": 0.80950189, + "learning_rate": 0.0002044463796671177, + "loss": 0.82023996, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.14880371, + "step": 3692, + "time_per_iteration": 2.7093636989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076355, + "balance_loss_mlp": 1.06070268, + "epoch": 0.7104655636783378, + "flos": 620378113536.0, + "grad_norm": 0.09666696589873951, + "language_loss": 0.80422229, + "learning_rate": 0.00020419514921627408, + "loss": 
0.81498581, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.15649414, + "step": 3693, + "time_per_iteration": 2.8826653957366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076233, + "balance_loss_mlp": 1.06092691, + "epoch": 0.7106579453636014, + "flos": 557322923520.0, + "grad_norm": 0.09593640635946206, + "language_loss": 0.77400964, + "learning_rate": 0.00020394403360705855, + "loss": 0.78477204, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.15283203, + "step": 3694, + "time_per_iteration": 2.711338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073674, + "balance_loss_mlp": 1.05866575, + "epoch": 0.710850327048865, + "flos": 513048245760.0, + "grad_norm": 0.07513000367190234, + "language_loss": 0.87831378, + "learning_rate": 0.00020369303293696228, + "loss": 0.88905054, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.15002441, + "step": 3695, + "time_per_iteration": 2.6499857902526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076787, + "balance_loss_mlp": 1.06118226, + "epoch": 0.7110427087341286, + "flos": 423619352064.0, + "grad_norm": 0.09128032628418083, + "language_loss": 0.78371423, + "learning_rate": 0.00020344214730343304, + "loss": 0.79448211, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.15588379, + "step": 3696, + "time_per_iteration": 2.6158998012542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066913, + "balance_loss_mlp": 1.05185723, + "epoch": 0.711235090419392, + "flos": 577415402496.0, + "grad_norm": 0.06490931607854103, + "language_loss": 0.79312873, + "learning_rate": 0.00020319137680387296, + "loss": 0.80379784, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.15039062, + "step": 3697, + "time_per_iteration": 2.949493646621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068106, + "balance_loss_mlp": 1.05243063, + "epoch": 0.7114274721046556, 
+ "flos": 448060709376.0, + "grad_norm": 0.07559912966503037, + "language_loss": 0.80551994, + "learning_rate": 0.0002029407215356398, + "loss": 0.81620097, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.15673828, + "step": 3698, + "time_per_iteration": 2.5261340141296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070696, + "balance_loss_mlp": 1.0556761, + "epoch": 0.7116198537899192, + "flos": 621962095104.0, + "grad_norm": 0.07643567713665894, + "language_loss": 0.83342177, + "learning_rate": 0.00020269018159604663, + "loss": 0.84412873, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.15002441, + "step": 3699, + "time_per_iteration": 2.7795286178588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069252, + "balance_loss_mlp": 1.05404091, + "epoch": 0.7118122354751828, + "flos": 498724895232.0, + "grad_norm": 0.06553173563730097, + "language_loss": 0.82171476, + "learning_rate": 0.00020243975708236162, + "loss": 0.83240736, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.15197754, + "step": 3700, + "time_per_iteration": 2.603534698486328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066013, + "balance_loss_mlp": 1.05129027, + "epoch": 0.7120046171604463, + "flos": 572718532608.0, + "grad_norm": 0.07521702556055786, + "language_loss": 0.86229205, + "learning_rate": 0.00020218944809180818, + "loss": 0.87295222, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.14709473, + "step": 3701, + "time_per_iteration": 2.7194745540618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072931, + "balance_loss_mlp": 1.05825663, + "epoch": 0.7121969988457099, + "flos": 572664204288.0, + "grad_norm": 0.06709763936599906, + "language_loss": 0.84454715, + "learning_rate": 0.00020193925472156493, + "loss": 0.85527647, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.14648438, + "step": 3702, + "time_per_iteration": 
2.6942999362945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008682, + "balance_loss_mlp": 1.00162458, + "epoch": 0.7123893805309734, + "flos": 1523429752320.0, + "grad_norm": 0.007804959242713824, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75297856, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.07080078, + "step": 3703, + "time_per_iteration": 4.9340033531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073126, + "balance_loss_mlp": 1.05801082, + "epoch": 0.712581762216237, + "flos": 615105280512.0, + "grad_norm": 0.05932039995937275, + "language_loss": 0.83487558, + "learning_rate": 0.00020143921523049863, + "loss": 0.8456068, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.15087891, + "step": 3704, + "time_per_iteration": 2.9551632404327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069307, + "balance_loss_mlp": 1.05464458, + "epoch": 0.7127741439015006, + "flos": 597777698304.0, + "grad_norm": 0.08724240459453055, + "language_loss": 0.84004354, + "learning_rate": 0.00020118936930380837, + "loss": 0.85073662, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.14648438, + "step": 3705, + "time_per_iteration": 2.7111411094665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076471, + "balance_loss_mlp": 1.06118834, + "epoch": 0.7129665255867641, + "flos": 537398198784.0, + "grad_norm": 0.07870920964767068, + "language_loss": 0.81005669, + "learning_rate": 0.0002009396393856932, + "loss": 0.8208214, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.15258789, + "step": 3706, + "time_per_iteration": 2.6393184661865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066656, + "balance_loss_mlp": 1.05216026, + "epoch": 0.7131589072720277, + "flos": 526442499072.0, + "grad_norm": 0.07318964523896145, + "language_loss": 0.82600415, + "learning_rate": 
0.00020069002557310673, + "loss": 0.83667076, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.14489746, + "step": 3707, + "time_per_iteration": 2.6772639751434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072653, + "balance_loss_mlp": 1.05814552, + "epoch": 0.7133512889572913, + "flos": 530919484416.0, + "grad_norm": 0.07177417053669936, + "language_loss": 0.76892489, + "learning_rate": 0.00020044052796295807, + "loss": 0.77965146, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.14501953, + "step": 3708, + "time_per_iteration": 2.8213729858398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068946, + "balance_loss_mlp": 1.05386651, + "epoch": 0.7135436706425549, + "flos": 503535564288.0, + "grad_norm": 0.08204040588858591, + "language_loss": 0.81975353, + "learning_rate": 0.00020019114665211063, + "loss": 0.83044302, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.1505127, + "step": 3709, + "time_per_iteration": 2.622581958770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069621, + "balance_loss_mlp": 1.0548985, + "epoch": 0.7137360523278183, + "flos": 515968786944.0, + "grad_norm": 0.1809650911769107, + "language_loss": 0.81334156, + "learning_rate": 0.00019994188173738276, + "loss": 0.82403779, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.14697266, + "step": 3710, + "time_per_iteration": 2.6591386795043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070639, + "balance_loss_mlp": 1.05586886, + "epoch": 0.7139284340130819, + "flos": 510389434368.0, + "grad_norm": 0.07384437980034154, + "language_loss": 0.80407298, + "learning_rate": 0.0001996927333155477, + "loss": 0.81477934, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.14758301, + "step": 3711, + "time_per_iteration": 2.8079118728637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075523, + "balance_loss_mlp": 
1.06068099, + "epoch": 0.7141208156983455, + "flos": 890275940352.0, + "grad_norm": 0.06892114468166954, + "language_loss": 0.85343927, + "learning_rate": 0.00019944370148333346, + "loss": 0.86419451, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.14819336, + "step": 3712, + "time_per_iteration": 3.1857290267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072063, + "balance_loss_mlp": 1.0572927, + "epoch": 0.7143131973836091, + "flos": 535779712512.0, + "grad_norm": 0.07489369079916172, + "language_loss": 0.79687518, + "learning_rate": 0.00019919478633742278, + "loss": 0.80759573, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.14758301, + "step": 3713, + "time_per_iteration": 2.705082416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077145, + "balance_loss_mlp": 1.06194544, + "epoch": 0.7145055790688727, + "flos": 473668300800.0, + "grad_norm": 0.08783705919644806, + "language_loss": 0.85156208, + "learning_rate": 0.00019894598797445302, + "loss": 0.86233354, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.15185547, + "step": 3714, + "time_per_iteration": 2.5540032386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072516, + "balance_loss_mlp": 1.05732846, + "epoch": 0.7146979607541362, + "flos": 570521885184.0, + "grad_norm": 0.06443194669340387, + "language_loss": 0.81776547, + "learning_rate": 0.00019869730649101615, + "loss": 0.82849067, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.15161133, + "step": 3715, + "time_per_iteration": 2.811156988143921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074394, + "balance_loss_mlp": 1.05909991, + "epoch": 0.7148903424393998, + "flos": 839666082816.0, + "grad_norm": 0.07982240605965638, + "language_loss": 0.72529298, + "learning_rate": 0.00019844874198365943, + "loss": 0.7360369, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.15283203, + "step": 
3716, + "time_per_iteration": 3.1387200355529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076276, + "balance_loss_mlp": 1.06136334, + "epoch": 0.7150827241246633, + "flos": 541823427072.0, + "grad_norm": 0.09017082219564719, + "language_loss": 0.83709008, + "learning_rate": 0.00019820029454888362, + "loss": 0.84785283, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.14892578, + "step": 3717, + "time_per_iteration": 2.7234127521514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022761, + "balance_loss_mlp": 1.01575112, + "epoch": 0.7152751058099269, + "flos": 1583678200320.0, + "grad_norm": 0.012614936180071102, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75544029, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.0703125, + "step": 3718, + "time_per_iteration": 5.082587957382202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079514, + "balance_loss_mlp": 1.06449401, + "epoch": 0.7154674874951905, + "flos": 517419145728.0, + "grad_norm": 0.07146981792263798, + "language_loss": 0.80162418, + "learning_rate": 0.0001977037512828529, + "loss": 0.8124193, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.15002441, + "step": 3719, + "time_per_iteration": 2.6214728355407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071959, + "balance_loss_mlp": 1.05687928, + "epoch": 0.715659869180454, + "flos": 602524127232.0, + "grad_norm": 0.0719921548875284, + "language_loss": 0.86400878, + "learning_rate": 0.0001974556556443734, + "loss": 0.87472844, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.15063477, + "step": 3720, + "time_per_iteration": 2.7185661792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071639, + "balance_loss_mlp": 1.05692816, + "epoch": 0.7158522508657176, + "flos": 531675684864.0, + "grad_norm": 0.10794401503038722, + "language_loss": 0.88869536, + 
"learning_rate": 0.00019720767746402547, + "loss": 0.89941168, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.14685059, + "step": 3721, + "time_per_iteration": 2.7171661853790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077737, + "balance_loss_mlp": 1.06312251, + "epoch": 0.7160446325509812, + "flos": 557569972224.0, + "grad_norm": 0.06715510904090914, + "language_loss": 0.7994473, + "learning_rate": 0.00019695981683808222, + "loss": 0.81022465, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.14599609, + "step": 3722, + "time_per_iteration": 2.764094114303589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076539, + "balance_loss_mlp": 1.06211424, + "epoch": 0.7162370142362448, + "flos": 690986847744.0, + "grad_norm": 0.0719125731951098, + "language_loss": 0.84857029, + "learning_rate": 0.00019671207386277225, + "loss": 0.85933566, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.14404297, + "step": 3723, + "time_per_iteration": 3.001659870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079748, + "balance_loss_mlp": 1.06515729, + "epoch": 0.7164293959215082, + "flos": 794109173760.0, + "grad_norm": 0.06669181204662188, + "language_loss": 0.78279907, + "learning_rate": 0.0001964644486342777, + "loss": 0.79359657, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.14575195, + "step": 3724, + "time_per_iteration": 2.9778544902801514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081993, + "balance_loss_mlp": 1.06744969, + "epoch": 0.7166217776067718, + "flos": 494178527232.0, + "grad_norm": 0.0857275082292459, + "language_loss": 0.862409, + "learning_rate": 0.00019621694124873524, + "loss": 0.87322897, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.14526367, + "step": 3725, + "time_per_iteration": 2.704180955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019771, + 
"balance_loss_mlp": 1.0125227, + "epoch": 0.7168141592920354, + "flos": 1401060354048.0, + "grad_norm": 0.010100997712727341, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77559853, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.07226562, + "step": 3726, + "time_per_iteration": 4.889356374740601 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081675, + "balance_loss_mlp": 1.06740522, + "epoch": 0.717006540977299, + "flos": 793150341120.0, + "grad_norm": 0.06067860958569485, + "language_loss": 0.77179575, + "learning_rate": 0.00019572228039082428, + "loss": 0.7826125, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.14257812, + "step": 3727, + "time_per_iteration": 3.0806260108947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086686, + "balance_loss_mlp": 1.07246482, + "epoch": 0.7171989226625626, + "flos": 554812416000.0, + "grad_norm": 0.11517752889227628, + "language_loss": 0.83454174, + "learning_rate": 0.0001954751271105002, + "loss": 0.84540862, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.14221191, + "step": 3728, + "time_per_iteration": 2.8201711177825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090041, + "balance_loss_mlp": 1.07510376, + "epoch": 0.717391304347826, + "flos": 555914409984.0, + "grad_norm": 0.0783907674353494, + "language_loss": 0.80835211, + "learning_rate": 0.00019522809205721687, + "loss": 0.81925255, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.14904785, + "step": 3729, + "time_per_iteration": 2.7735860347747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086368, + "balance_loss_mlp": 1.07193196, + "epoch": 0.7175836860330896, + "flos": 538855898112.0, + "grad_norm": 0.0782422692062248, + "language_loss": 0.82922757, + "learning_rate": 0.0001949811753268816, + "loss": 0.84009123, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 
0.14428711, + "step": 3730, + "time_per_iteration": 2.7340402603149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085427, + "balance_loss_mlp": 1.07047808, + "epoch": 0.7177760677183532, + "flos": 515637674496.0, + "grad_norm": 0.07822041527126099, + "language_loss": 0.82415104, + "learning_rate": 0.00019473437701535634, + "loss": 0.83500528, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.14929199, + "step": 3731, + "time_per_iteration": 2.6087753772735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077018, + "balance_loss_mlp": 1.06260514, + "epoch": 0.7179684494036168, + "flos": 674719041024.0, + "grad_norm": 0.09315520299322393, + "language_loss": 0.89131868, + "learning_rate": 0.00019448769721845677, + "loss": 0.90208888, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.1439209, + "step": 3732, + "time_per_iteration": 2.836510419845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077686, + "balance_loss_mlp": 1.06293976, + "epoch": 0.7181608310888803, + "flos": 469912637952.0, + "grad_norm": 0.09025148051517691, + "language_loss": 0.85745353, + "learning_rate": 0.00019424113603195203, + "loss": 0.86823046, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.1472168, + "step": 3733, + "time_per_iteration": 2.562520742416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079742, + "balance_loss_mlp": 1.06485271, + "epoch": 0.7183532127741439, + "flos": 593952652800.0, + "grad_norm": 0.07835269792198636, + "language_loss": 0.80024004, + "learning_rate": 0.0001939946935515657, + "loss": 0.81103742, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.14868164, + "step": 3734, + "time_per_iteration": 2.836775302886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.05774188, + "epoch": 0.7185455944594075, + "flos": 498917615616.0, + "grad_norm": 0.12420836308345841, + 
"language_loss": 0.80785656, + "learning_rate": 0.0001937483698729755, + "loss": 0.81858528, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.15100098, + "step": 3735, + "time_per_iteration": 2.63600492477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071914, + "balance_loss_mlp": 1.05714417, + "epoch": 0.718737976144671, + "flos": 814933260288.0, + "grad_norm": 0.06842150185792192, + "language_loss": 0.82507128, + "learning_rate": 0.0001935021650918128, + "loss": 0.8357904, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.14758301, + "step": 3736, + "time_per_iteration": 3.00943922996521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068199, + "balance_loss_mlp": 1.0535481, + "epoch": 0.7189303578299346, + "flos": 438328143360.0, + "grad_norm": 0.07910633337871513, + "language_loss": 0.86689806, + "learning_rate": 0.0001932560793036625, + "loss": 0.87758005, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.14624023, + "step": 3737, + "time_per_iteration": 2.5100209712982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071415, + "balance_loss_mlp": 1.05637121, + "epoch": 0.7191227395151981, + "flos": 549398992896.0, + "grad_norm": 0.07360308333676036, + "language_loss": 0.86295319, + "learning_rate": 0.00019301011260406382, + "loss": 0.87366736, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.15014648, + "step": 3738, + "time_per_iteration": 2.6612305641174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066115, + "balance_loss_mlp": 1.05066597, + "epoch": 0.7193151212004617, + "flos": 626938320384.0, + "grad_norm": 0.06504656569076563, + "language_loss": 0.79763281, + "learning_rate": 0.00019276426508850936, + "loss": 0.80829394, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.15429688, + "step": 3739, + "time_per_iteration": 2.7507288455963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01068994, + "balance_loss_mlp": 1.05356801, + "epoch": 0.7195075028857253, + "flos": 741062960640.0, + "grad_norm": 0.14081168877709307, + "language_loss": 0.80209506, + "learning_rate": 0.00019251853685244564, + "loss": 0.81278491, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.15405273, + "step": 3740, + "time_per_iteration": 3.0117878913879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066123, + "balance_loss_mlp": 1.05085278, + "epoch": 0.7196998845709889, + "flos": 802875566592.0, + "grad_norm": 0.09880671887971038, + "language_loss": 0.80556595, + "learning_rate": 0.00019227292799127283, + "loss": 0.8162272, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.15258789, + "step": 3741, + "time_per_iteration": 3.026409864425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064138, + "balance_loss_mlp": 1.04933214, + "epoch": 0.7198922662562524, + "flos": 925183669248.0, + "grad_norm": 0.07716038295803591, + "language_loss": 0.79115927, + "learning_rate": 0.00019202743860034454, + "loss": 0.80180067, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.14770508, + "step": 3742, + "time_per_iteration": 3.2409439086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062198, + "balance_loss_mlp": 1.04710603, + "epoch": 0.7200846479415159, + "flos": 580111289856.0, + "grad_norm": 0.0865048699099666, + "language_loss": 0.8386541, + "learning_rate": 0.00019178206877496873, + "loss": 0.84927607, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.15075684, + "step": 3743, + "time_per_iteration": 2.7031853199005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065388, + "balance_loss_mlp": 1.05033231, + "epoch": 0.7202770296267795, + "flos": 557695881216.0, + "grad_norm": 0.06660391987267253, + "language_loss": 0.85197371, + "learning_rate": 0.0001915368186104059, + "loss": 0.86262763, + "num_input_tokens_seen": 310479776, + 
"router_z_loss_mlp": 0.15026855, + "step": 3744, + "time_per_iteration": 2.80344557762146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067637, + "balance_loss_mlp": 1.05193746, + "epoch": 0.7204694113120431, + "flos": 672552129024.0, + "grad_norm": 0.07605590282722621, + "language_loss": 0.8109616, + "learning_rate": 0.0001912916882018706, + "loss": 0.82163799, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.15698242, + "step": 3745, + "time_per_iteration": 2.8043081760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073912, + "balance_loss_mlp": 1.05809283, + "epoch": 0.7206617929973067, + "flos": 799194055680.0, + "grad_norm": 0.09426618368019588, + "language_loss": 0.79127324, + "learning_rate": 0.00019104667764453125, + "loss": 0.80201232, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.15808105, + "step": 3746, + "time_per_iteration": 3.0704562664031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067683, + "balance_loss_mlp": 1.0524838, + "epoch": 0.7208541746825702, + "flos": 531898140672.0, + "grad_norm": 0.06643820747478134, + "language_loss": 0.8021549, + "learning_rate": 0.00019080178703350926, + "loss": 0.81283176, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.1517334, + "step": 3747, + "time_per_iteration": 2.68495774269104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068891, + "balance_loss_mlp": 1.05280995, + "epoch": 0.7210465563678338, + "flos": 535139882496.0, + "grad_norm": 0.0742282179981503, + "language_loss": 0.8279261, + "learning_rate": 0.00019055701646387952, + "loss": 0.838615, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.1607666, + "step": 3748, + "time_per_iteration": 2.6640145778656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028096, + "balance_loss_mlp": 1.02113438, + "epoch": 0.7212389380530974, + "flos": 1533908606976.0, + "grad_norm": 
0.025188249902834022, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81500781, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.06982422, + "step": 3749, + "time_per_iteration": 4.826270341873169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066097, + "balance_loss_mlp": 1.05093408, + "epoch": 0.7214313197383609, + "flos": 461511862272.0, + "grad_norm": 0.08049000033963269, + "language_loss": 0.86480904, + "learning_rate": 0.00019006783582886368, + "loss": 0.87546998, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.15136719, + "step": 3750, + "time_per_iteration": 2.621736526489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067451, + "balance_loss_mlp": 1.05215693, + "epoch": 0.7216237014236244, + "flos": 1037134056960.0, + "grad_norm": 0.08524695909851505, + "language_loss": 0.82916629, + "learning_rate": 0.00018982342595339437, + "loss": 0.83984083, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.15270996, + "step": 3751, + "time_per_iteration": 3.483065128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070357, + "balance_loss_mlp": 1.05556357, + "epoch": 0.721816083108888, + "flos": 895951466496.0, + "grad_norm": 0.06727789695466473, + "language_loss": 0.82144976, + "learning_rate": 0.00018957913649915076, + "loss": 0.83215332, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.14770508, + "step": 3752, + "time_per_iteration": 3.1644365787506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072387, + "balance_loss_mlp": 1.05725896, + "epoch": 0.7220084647941516, + "flos": 523314556416.0, + "grad_norm": 0.07729245448911205, + "language_loss": 0.79620636, + "learning_rate": 0.00018933496756097428, + "loss": 0.80693024, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.15100098, + "step": 3753, + "time_per_iteration": 2.620807409286499 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01072735, + "balance_loss_mlp": 1.05732155, + "epoch": 0.7222008464794152, + "flos": 816099494400.0, + "grad_norm": 0.5805538149813421, + "language_loss": 0.81562132, + "learning_rate": 0.0001890909192336603, + "loss": 0.8263486, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.15393066, + "step": 3754, + "time_per_iteration": 3.042017936706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078212, + "balance_loss_mlp": 1.06300032, + "epoch": 0.7223932281646788, + "flos": 749053702656.0, + "grad_norm": 0.0713648645371922, + "language_loss": 0.70115459, + "learning_rate": 0.00018884699161195623, + "loss": 0.71193671, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.15185547, + "step": 3755, + "time_per_iteration": 2.9720511436462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076125, + "balance_loss_mlp": 1.06104493, + "epoch": 0.7225856098499422, + "flos": 745502870016.0, + "grad_norm": 0.09493040514567173, + "language_loss": 0.77216029, + "learning_rate": 0.00018860318479056327, + "loss": 0.78292155, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.15075684, + "step": 3756, + "time_per_iteration": 3.119727373123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083813, + "balance_loss_mlp": 1.06946039, + "epoch": 0.7227779915352058, + "flos": 547330825728.0, + "grad_norm": 0.0825815003753041, + "language_loss": 0.83252132, + "learning_rate": 0.00018835949886413555, + "loss": 0.84335947, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.14343262, + "step": 3757, + "time_per_iteration": 2.767611026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080172, + "balance_loss_mlp": 1.06541348, + "epoch": 0.7229703732204694, + "flos": 530484857856.0, + "grad_norm": 0.07604080274562658, + "language_loss": 0.7847476, + "learning_rate": 0.0001881159339272806, + "loss": 0.79554933, + 
"num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.14733887, + "step": 3758, + "time_per_iteration": 2.644622325897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086486, + "balance_loss_mlp": 1.07175171, + "epoch": 0.723162754905733, + "flos": 528355021824.0, + "grad_norm": 0.07134654052906102, + "language_loss": 0.78514063, + "learning_rate": 0.00018787249007455858, + "loss": 0.79600549, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.14709473, + "step": 3759, + "time_per_iteration": 2.613767147064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089448, + "balance_loss_mlp": 1.07513046, + "epoch": 0.7233551365909965, + "flos": 654868468224.0, + "grad_norm": 0.07096105030949329, + "language_loss": 0.71290004, + "learning_rate": 0.00018762916740048302, + "loss": 0.72379452, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.14318848, + "step": 3760, + "time_per_iteration": 2.822312355041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010919, + "balance_loss_mlp": 1.07746363, + "epoch": 0.7235475182762601, + "flos": 522365635584.0, + "grad_norm": 0.060444894943140336, + "language_loss": 0.85770047, + "learning_rate": 0.0001873859659995195, + "loss": 0.86861944, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.14428711, + "step": 3761, + "time_per_iteration": 2.7546091079711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096949, + "balance_loss_mlp": 1.08265626, + "epoch": 0.7237398999615237, + "flos": 609170595840.0, + "grad_norm": 0.0683412355594852, + "language_loss": 0.83724195, + "learning_rate": 0.0001871428859660878, + "loss": 0.84821141, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.14282227, + "step": 3762, + "time_per_iteration": 2.770059823989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099122, + "balance_loss_mlp": 1.08488798, + "epoch": 0.7239322816467872, + "flos": 
658987176960.0, + "grad_norm": 0.08191796316314504, + "language_loss": 0.82060403, + "learning_rate": 0.00018689992739455975, + "loss": 0.8315953, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.14233398, + "step": 3763, + "time_per_iteration": 2.929271697998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092957, + "balance_loss_mlp": 1.07871115, + "epoch": 0.7241246633320508, + "flos": 969282878976.0, + "grad_norm": 0.06346083155179776, + "language_loss": 0.85959136, + "learning_rate": 0.00018665709037926027, + "loss": 0.87052089, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.14257812, + "step": 3764, + "time_per_iteration": 3.3369805812835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099745, + "balance_loss_mlp": 1.08588123, + "epoch": 0.7243170450173143, + "flos": 514995273216.0, + "grad_norm": 0.08806284436028786, + "language_loss": 0.84687865, + "learning_rate": 0.00018641437501446694, + "loss": 0.85787606, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.13867188, + "step": 3765, + "time_per_iteration": 2.622209072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096599, + "balance_loss_mlp": 1.08235359, + "epoch": 0.7245094267025779, + "flos": 559746796032.0, + "grad_norm": 0.07635972593652277, + "language_loss": 0.82246089, + "learning_rate": 0.0001861717813944104, + "loss": 0.83342695, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.14257812, + "step": 3766, + "time_per_iteration": 2.6759207248687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095414, + "balance_loss_mlp": 1.08095431, + "epoch": 0.7247018083878415, + "flos": 612642134016.0, + "grad_norm": 0.0797588387433463, + "language_loss": 0.79539496, + "learning_rate": 0.00018592930961327365, + "loss": 0.8063491, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.14440918, + "step": 3767, + "time_per_iteration": 2.7272777557373047 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109367, + "balance_loss_mlp": 1.07921004, + "epoch": 0.7248941900731051, + "flos": 634676871168.0, + "grad_norm": 0.06368751268419225, + "language_loss": 0.87997532, + "learning_rate": 0.00018568695976519273, + "loss": 0.89091206, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.14440918, + "step": 3768, + "time_per_iteration": 2.835996389389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094879, + "balance_loss_mlp": 1.07991815, + "epoch": 0.7250865717583687, + "flos": 424941230592.0, + "grad_norm": 0.07271677335378793, + "language_loss": 0.80159616, + "learning_rate": 0.00018544473194425593, + "loss": 0.81254494, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.14941406, + "step": 3769, + "time_per_iteration": 2.51243257522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092608, + "balance_loss_mlp": 1.07782626, + "epoch": 0.7252789534436321, + "flos": 635114068992.0, + "grad_norm": 0.10799987095433689, + "language_loss": 0.78685284, + "learning_rate": 0.00018520262624450485, + "loss": 0.79777896, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.14770508, + "step": 3770, + "time_per_iteration": 2.936739444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095578, + "balance_loss_mlp": 1.08109403, + "epoch": 0.7254713351288957, + "flos": 617185930752.0, + "grad_norm": 0.05982613005726902, + "language_loss": 0.87150741, + "learning_rate": 0.00018496064275993324, + "loss": 0.88246322, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.14453125, + "step": 3771, + "time_per_iteration": 2.775094747543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087233, + "balance_loss_mlp": 1.07266569, + "epoch": 0.7256637168141593, + "flos": 766986983424.0, + "grad_norm": 0.07412314995641861, + "language_loss": 0.81699574, + "learning_rate": 0.00018471878158448686, + "loss": 
0.82786798, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.14562988, + "step": 3772, + "time_per_iteration": 2.945774793624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093119, + "balance_loss_mlp": 1.07849216, + "epoch": 0.7258560984994229, + "flos": 495559503360.0, + "grad_norm": 0.0628089712415676, + "language_loss": 0.84061623, + "learning_rate": 0.00018447704281206512, + "loss": 0.85154736, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.14611816, + "step": 3773, + "time_per_iteration": 2.904330015182495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085201, + "balance_loss_mlp": 1.07037139, + "epoch": 0.7260484801846864, + "flos": 530069681664.0, + "grad_norm": 0.06945926815964382, + "language_loss": 0.82613432, + "learning_rate": 0.0001842354265365191, + "loss": 0.83698636, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.14819336, + "step": 3774, + "time_per_iteration": 4.125708818435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089216, + "balance_loss_mlp": 1.07469606, + "epoch": 0.72624086186995, + "flos": 624964128768.0, + "grad_norm": 0.10416012988421754, + "language_loss": 0.80548131, + "learning_rate": 0.0001839939328516526, + "loss": 0.81637341, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.1451416, + "step": 3775, + "time_per_iteration": 2.750432014465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086316, + "balance_loss_mlp": 1.07196307, + "epoch": 0.7264332435552135, + "flos": 716522858496.0, + "grad_norm": 0.07329543067618247, + "language_loss": 0.81326902, + "learning_rate": 0.0001837525618512218, + "loss": 0.8241322, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.14343262, + "step": 3776, + "time_per_iteration": 2.9147586822509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090392, + "balance_loss_mlp": 1.07571745, + "epoch": 0.7266256252404771, + 
"flos": 681036968448.0, + "grad_norm": 0.09666492868524106, + "language_loss": 0.82873899, + "learning_rate": 0.00018351131362893519, + "loss": 0.83964288, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.14660645, + "step": 3777, + "time_per_iteration": 2.857516050338745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087388, + "balance_loss_mlp": 1.07228446, + "epoch": 0.7268180069257407, + "flos": 518906580480.0, + "grad_norm": 0.07721555161828438, + "language_loss": 0.80164421, + "learning_rate": 0.00018327018827845364, + "loss": 0.81251806, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.15087891, + "step": 3778, + "time_per_iteration": 2.6123135089874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088784, + "balance_loss_mlp": 1.07418132, + "epoch": 0.7270103886110042, + "flos": 512662804992.0, + "grad_norm": 0.07034168879093446, + "language_loss": 0.87450492, + "learning_rate": 0.00018302918589339036, + "loss": 0.88539279, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.14599609, + "step": 3779, + "time_per_iteration": 2.635622024536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089717, + "balance_loss_mlp": 1.07491088, + "epoch": 0.7272027702962678, + "flos": 546653919744.0, + "grad_norm": 0.06972150146327356, + "language_loss": 0.89592332, + "learning_rate": 0.00018278830656731054, + "loss": 0.90682048, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.14782715, + "step": 3780, + "time_per_iteration": 2.7083652019500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089508, + "balance_loss_mlp": 1.07489288, + "epoch": 0.7273951519815314, + "flos": 593048521728.0, + "grad_norm": 0.06413088918565218, + "language_loss": 0.86338055, + "learning_rate": 0.00018254755039373222, + "loss": 0.87427557, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.14599609, + "step": 3781, + "time_per_iteration": 
2.746243953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083719, + "balance_loss_mlp": 1.06884193, + "epoch": 0.727587533666795, + "flos": 606012917760.0, + "grad_norm": 0.07626368504613235, + "language_loss": 0.83212483, + "learning_rate": 0.0001823069174661252, + "loss": 0.84296203, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.14855957, + "step": 3782, + "time_per_iteration": 2.8131794929504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076535, + "balance_loss_mlp": 1.06205118, + "epoch": 0.7277799153520584, + "flos": 513021081600.0, + "grad_norm": 0.06295680687034302, + "language_loss": 0.78633702, + "learning_rate": 0.00018206640787791112, + "loss": 0.79710239, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.14453125, + "step": 3783, + "time_per_iteration": 2.6886956691741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085999, + "balance_loss_mlp": 1.07144356, + "epoch": 0.727972297037322, + "flos": 537756475392.0, + "grad_norm": 0.06647490190453816, + "language_loss": 0.85873067, + "learning_rate": 0.00018182602172246416, + "loss": 0.86959064, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.14575195, + "step": 3784, + "time_per_iteration": 2.6511552333831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086575, + "balance_loss_mlp": 1.07187629, + "epoch": 0.7281646787225856, + "flos": 535038566400.0, + "grad_norm": 0.08017450109012514, + "language_loss": 0.76435304, + "learning_rate": 0.00018158575909311075, + "loss": 0.77521873, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.14685059, + "step": 3785, + "time_per_iteration": 2.681915044784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084547, + "balance_loss_mlp": 1.06974173, + "epoch": 0.7283570604078492, + "flos": 625055533056.0, + "grad_norm": 0.08921915239194265, + "language_loss": 0.79687071, + "learning_rate": 
0.000181345620083129, + "loss": 0.80771625, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.14794922, + "step": 3786, + "time_per_iteration": 2.7836999893188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079267, + "balance_loss_mlp": 1.06438935, + "epoch": 0.7285494420931128, + "flos": 534173709312.0, + "grad_norm": 0.06165566569921882, + "language_loss": 0.86873049, + "learning_rate": 0.00018110560478574927, + "loss": 0.8795231, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.14855957, + "step": 3787, + "time_per_iteration": 2.7312710285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076321, + "balance_loss_mlp": 1.06149101, + "epoch": 0.7287418237783763, + "flos": 666548061696.0, + "grad_norm": 0.08287585285923407, + "language_loss": 0.80037522, + "learning_rate": 0.0001808657132941533, + "loss": 0.81113839, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.14807129, + "step": 3788, + "time_per_iteration": 2.8081939220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076757, + "balance_loss_mlp": 1.06220174, + "epoch": 0.7289342054636399, + "flos": 550602302976.0, + "grad_norm": 0.07558714577930627, + "language_loss": 0.83176941, + "learning_rate": 0.00018062594570147572, + "loss": 0.84253705, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.14562988, + "step": 3789, + "time_per_iteration": 2.6432437896728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083922, + "balance_loss_mlp": 1.0693903, + "epoch": 0.7291265871489034, + "flos": 687923145216.0, + "grad_norm": 0.07287349228687776, + "language_loss": 0.84963691, + "learning_rate": 0.00018038630210080243, + "loss": 0.86047614, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.14526367, + "step": 3790, + "time_per_iteration": 2.865356683731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073151, + "balance_loss_mlp": 
1.05852365, + "epoch": 0.729318968834167, + "flos": 572664204288.0, + "grad_norm": 0.07168736979899527, + "language_loss": 0.8499006, + "learning_rate": 0.0001801467825851712, + "loss": 0.86063206, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.14611816, + "step": 3791, + "time_per_iteration": 2.7372162342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073095, + "balance_loss_mlp": 1.05845594, + "epoch": 0.7295113505194305, + "flos": 586061028864.0, + "grad_norm": 0.07281056570289735, + "language_loss": 0.78196633, + "learning_rate": 0.00017990738724757172, + "loss": 0.79269731, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.14611816, + "step": 3792, + "time_per_iteration": 2.8774027824401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107242, + "balance_loss_mlp": 1.05760276, + "epoch": 0.7297037322046941, + "flos": 707185645056.0, + "grad_norm": 0.06295411863995527, + "language_loss": 0.82293737, + "learning_rate": 0.00017966811618094598, + "loss": 0.83366162, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.14794922, + "step": 3793, + "time_per_iteration": 2.945582151412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074316, + "balance_loss_mlp": 1.05921233, + "epoch": 0.7298961138899577, + "flos": 487292350464.0, + "grad_norm": 0.08262020885938813, + "language_loss": 0.8475967, + "learning_rate": 0.00017942896947818664, + "loss": 0.85833991, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.15075684, + "step": 3794, + "time_per_iteration": 2.6285526752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_mlp": 1.01879299, + "epoch": 0.7300884955752213, + "flos": 1365804260352.0, + "grad_norm": 0.019285645442211487, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75850904, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.07080078, + "step": 
3795, + "time_per_iteration": 4.929250478744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072587, + "balance_loss_mlp": 1.05812693, + "epoch": 0.7302808772604849, + "flos": 531806736384.0, + "grad_norm": 0.09431722804853598, + "language_loss": 0.85334897, + "learning_rate": 0.00017895104953559947, + "loss": 0.86407483, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.14453125, + "step": 3796, + "time_per_iteration": 2.605687141418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082695, + "balance_loss_mlp": 1.06815124, + "epoch": 0.7304732589457483, + "flos": 436171143168.0, + "grad_norm": 0.08633113944613344, + "language_loss": 0.89526945, + "learning_rate": 0.00017871227648131672, + "loss": 0.9060964, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.1451416, + "step": 3797, + "time_per_iteration": 2.521352767944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072295, + "balance_loss_mlp": 1.05728662, + "epoch": 0.7306656406310119, + "flos": 451621080576.0, + "grad_norm": 0.06678098801503493, + "language_loss": 0.82943988, + "learning_rate": 0.0001784736281619907, + "loss": 0.84016287, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.14978027, + "step": 3798, + "time_per_iteration": 2.609084129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074305, + "balance_loss_mlp": 1.05968988, + "epoch": 0.7308580223162755, + "flos": 512010491904.0, + "grad_norm": 0.0786239518455689, + "language_loss": 0.74484026, + "learning_rate": 0.00017823510467027232, + "loss": 0.75558329, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.14599609, + "step": 3799, + "time_per_iteration": 2.8423922061920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067282, + "balance_loss_mlp": 1.05283403, + "epoch": 0.7310504040015391, + "flos": 375423455232.0, + "grad_norm": 0.07912584621582001, + "language_loss": 0.78262001, + 
"learning_rate": 0.00017799670609876516, + "loss": 0.79329282, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.14477539, + "step": 3800, + "time_per_iteration": 2.535236120223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071664, + "balance_loss_mlp": 1.05734682, + "epoch": 0.7312427856868026, + "flos": 549334752768.0, + "grad_norm": 0.06546690696594622, + "language_loss": 0.88949418, + "learning_rate": 0.00017775843254002366, + "loss": 0.90021086, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.14306641, + "step": 3801, + "time_per_iteration": 2.7845892906188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075479, + "balance_loss_mlp": 1.06116223, + "epoch": 0.7314351673720662, + "flos": 767238801408.0, + "grad_norm": 0.06442177991273089, + "language_loss": 0.83698308, + "learning_rate": 0.00017752028408655367, + "loss": 0.84773785, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.14306641, + "step": 3802, + "time_per_iteration": 3.0654079914093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073558, + "balance_loss_mlp": 1.05856121, + "epoch": 0.7316275490573297, + "flos": 486734012928.0, + "grad_norm": 0.177225948577802, + "language_loss": 0.85229474, + "learning_rate": 0.00017728226083081272, + "loss": 0.86303031, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.14978027, + "step": 3803, + "time_per_iteration": 2.5718350410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074242, + "balance_loss_mlp": 1.05967474, + "epoch": 0.7318199307425933, + "flos": 473428592640.0, + "grad_norm": 0.08565568804066387, + "language_loss": 0.81454623, + "learning_rate": 0.00017704436286520965, + "loss": 0.82528865, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.14538574, + "step": 3804, + "time_per_iteration": 2.6038320064544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106992, + 
"balance_loss_mlp": 1.05504251, + "epoch": 0.7320123124278569, + "flos": 549463233024.0, + "grad_norm": 0.12360179299397371, + "language_loss": 0.8468073, + "learning_rate": 0.0001768065902821046, + "loss": 0.85750651, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.1484375, + "step": 3805, + "time_per_iteration": 2.651048183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071694, + "balance_loss_mlp": 1.05726933, + "epoch": 0.7322046941131204, + "flos": 570781416960.0, + "grad_norm": 0.07802569861836066, + "language_loss": 0.82316971, + "learning_rate": 0.00017656894317380907, + "loss": 0.83388662, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.14416504, + "step": 3806, + "time_per_iteration": 2.756763219833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014485, + "balance_loss_mlp": 1.00723755, + "epoch": 0.732397075798384, + "flos": 1469165548032.0, + "grad_norm": 0.009270144136788097, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77045751, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.07226562, + "step": 3807, + "time_per_iteration": 5.025331735610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075041, + "balance_loss_mlp": 1.06017613, + "epoch": 0.7325894574836476, + "flos": 464862260736.0, + "grad_norm": 0.08110176134528321, + "language_loss": 0.83730799, + "learning_rate": 0.00017609402575064875, + "loss": 0.8480584, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.1484375, + "step": 3808, + "time_per_iteration": 2.5651097297668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080989, + "balance_loss_mlp": 1.06662416, + "epoch": 0.7327818391689112, + "flos": 495493065216.0, + "grad_norm": 0.07932211737712976, + "language_loss": 0.81102324, + "learning_rate": 0.00017585675562016367, + "loss": 0.82183307, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 
0.14355469, + "step": 3809, + "time_per_iteration": 2.6230361461639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016113, + "balance_loss_mlp": 1.00881767, + "epoch": 0.7329742208541746, + "flos": 1433489508864.0, + "grad_norm": 0.01295473198731384, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78228962, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.07275391, + "step": 3810, + "time_per_iteration": 4.819159746170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081965, + "balance_loss_mlp": 1.06713569, + "epoch": 0.7331666025394382, + "flos": 496889095680.0, + "grad_norm": 0.07185927058157819, + "language_loss": 0.8484388, + "learning_rate": 0.00017538259298196474, + "loss": 0.85925841, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.14819336, + "step": 3811, + "time_per_iteration": 2.5887067317962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079969, + "balance_loss_mlp": 1.06556845, + "epoch": 0.7333589842247018, + "flos": 538524785664.0, + "grad_norm": 0.0628616136872852, + "language_loss": 0.81993341, + "learning_rate": 0.00017514570065833745, + "loss": 0.83073318, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.14379883, + "step": 3812, + "time_per_iteration": 2.7150583267211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082723, + "balance_loss_mlp": 1.06795263, + "epoch": 0.7335513659099654, + "flos": 491067836928.0, + "grad_norm": 0.08278360701185013, + "language_loss": 0.80552948, + "learning_rate": 0.00017490893445433426, + "loss": 0.81635672, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.14746094, + "step": 3813, + "time_per_iteration": 2.595290422439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080582, + "balance_loss_mlp": 1.0658834, + "epoch": 0.733743747595229, + "flos": 562150844928.0, + "grad_norm": 0.06487588714867228, + 
"language_loss": 0.81347382, + "learning_rate": 0.00017467229446187587, + "loss": 0.82427955, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.14709473, + "step": 3814, + "time_per_iteration": 2.7173616886138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089973, + "balance_loss_mlp": 1.07540596, + "epoch": 0.7339361292804925, + "flos": 538581685248.0, + "grad_norm": 0.08798326338090434, + "language_loss": 0.81541699, + "learning_rate": 0.00017443578077283424, + "loss": 0.82631671, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.14550781, + "step": 3815, + "time_per_iteration": 2.7105531692504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087021, + "balance_loss_mlp": 1.07223916, + "epoch": 0.734128510965756, + "flos": 548469895680.0, + "grad_norm": 0.06566892057084078, + "language_loss": 0.84730685, + "learning_rate": 0.0001741993934790319, + "loss": 0.85817701, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.14770508, + "step": 3816, + "time_per_iteration": 2.77266001701355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107762, + "balance_loss_mlp": 1.06295753, + "epoch": 0.7343208926510196, + "flos": 540066548736.0, + "grad_norm": 0.09067152232159664, + "language_loss": 0.84255576, + "learning_rate": 0.00017396313267224273, + "loss": 0.85333198, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.14660645, + "step": 3817, + "time_per_iteration": 2.7289724349975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082678, + "balance_loss_mlp": 1.06814599, + "epoch": 0.7345132743362832, + "flos": 571095277056.0, + "grad_norm": 0.07934793398680723, + "language_loss": 0.8837018, + "learning_rate": 0.0001737269984441912, + "loss": 0.89452857, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.14526367, + "step": 3818, + "time_per_iteration": 2.679121255874634 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0108093, + "balance_loss_mlp": 1.06629074, + "epoch": 0.7347056560215467, + "flos": 545403621888.0, + "grad_norm": 0.06604620451137376, + "language_loss": 0.85161746, + "learning_rate": 0.00017349099088655263, + "loss": 0.86242676, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.14624023, + "step": 3819, + "time_per_iteration": 2.7354836463928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107929, + "balance_loss_mlp": 1.06509197, + "epoch": 0.7348980377068103, + "flos": 595949239296.0, + "grad_norm": 0.06952246164346525, + "language_loss": 0.80691403, + "learning_rate": 0.00017325511009095375, + "loss": 0.81770694, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.14208984, + "step": 3820, + "time_per_iteration": 2.7548413276672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072543, + "balance_loss_mlp": 1.05823815, + "epoch": 0.7350904193920739, + "flos": 538554521088.0, + "grad_norm": 0.06808643977119672, + "language_loss": 0.83516192, + "learning_rate": 0.00017301935614897113, + "loss": 0.8458873, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.14318848, + "step": 3821, + "time_per_iteration": 2.7016494274139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074862, + "balance_loss_mlp": 1.0602231, + "epoch": 0.7352828010773375, + "flos": 512981434368.0, + "grad_norm": 0.06002582073431783, + "language_loss": 0.8197211, + "learning_rate": 0.00017278372915213274, + "loss": 0.83046979, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.14624023, + "step": 3822, + "time_per_iteration": 2.6761066913604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016556, + "balance_loss_mlp": 1.00921309, + "epoch": 0.735475182762601, + "flos": 1553820848640.0, + "grad_norm": 0.014100117797771941, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80910403, + "num_input_tokens_seen": 
316967104, + "router_z_loss_mlp": 0.07324219, + "step": 3823, + "time_per_iteration": 5.009763956069946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074081, + "balance_loss_mlp": 1.05965686, + "epoch": 0.7356675644478645, + "flos": 681308610048.0, + "grad_norm": 0.08234273424412843, + "language_loss": 0.806014, + "learning_rate": 0.00017231285635975314, + "loss": 0.81675482, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.1439209, + "step": 3824, + "time_per_iteration": 2.9129364490509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069863, + "balance_loss_mlp": 1.05441332, + "epoch": 0.7358599461331281, + "flos": 515215157760.0, + "grad_norm": 0.08116369820319415, + "language_loss": 0.82920796, + "learning_rate": 0.00017207761074702115, + "loss": 0.83990657, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.1541748, + "step": 3825, + "time_per_iteration": 2.641977071762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069473, + "balance_loss_mlp": 1.05479813, + "epoch": 0.7360523278183917, + "flos": 443973934080.0, + "grad_norm": 0.06363910261754813, + "language_loss": 0.83689082, + "learning_rate": 0.0001718424924450514, + "loss": 0.8475855, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.14660645, + "step": 3826, + "time_per_iteration": 2.6134989261627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106716, + "balance_loss_mlp": 1.05211544, + "epoch": 0.7362447095036553, + "flos": 603423489024.0, + "grad_norm": 0.06392814442784994, + "language_loss": 0.85810113, + "learning_rate": 0.00017160750154512482, + "loss": 0.86877275, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.15026855, + "step": 3827, + "time_per_iteration": 2.736499786376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071488, + "balance_loss_mlp": 1.05717158, + "epoch": 0.7364370911889189, + "flos": 553095184896.0, + "grad_norm": 
0.060676486527101066, + "language_loss": 0.83347571, + "learning_rate": 0.0001713726381384731, + "loss": 0.8441906, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.14318848, + "step": 3828, + "time_per_iteration": 2.7891271114349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067133, + "balance_loss_mlp": 1.05237508, + "epoch": 0.7366294728741823, + "flos": 449061387264.0, + "grad_norm": 0.07991922680329289, + "language_loss": 0.81341559, + "learning_rate": 0.00017113790231627812, + "loss": 0.8240869, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.14733887, + "step": 3829, + "time_per_iteration": 2.525070905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011404, + "balance_loss_mlp": 1.00415587, + "epoch": 0.7368218545594459, + "flos": 1535502500352.0, + "grad_norm": 0.013118834983913303, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80269623, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.07226562, + "step": 3830, + "time_per_iteration": 4.838621377944946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106649, + "balance_loss_mlp": 1.05186284, + "epoch": 0.7370142362447095, + "flos": 515425130496.0, + "grad_norm": 0.07911932608285421, + "language_loss": 0.81592512, + "learning_rate": 0.00017066881378973936, + "loss": 0.82659006, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.14587402, + "step": 3831, + "time_per_iteration": 2.7149910926818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106728, + "balance_loss_mlp": 1.0528084, + "epoch": 0.7372066179299731, + "flos": 500805172224.0, + "grad_norm": 0.06667618306638196, + "language_loss": 0.82793903, + "learning_rate": 0.00017043446126751189, + "loss": 0.8386119, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.14453125, + "step": 3832, + "time_per_iteration": 2.6927688121795654 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.0106817, + "balance_loss_mlp": 1.0533402, + "epoch": 0.7373989996152366, + "flos": 558083893248.0, + "grad_norm": 0.07864183191565526, + "language_loss": 0.76522374, + "learning_rate": 0.00017020023669397376, + "loss": 0.77590549, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.14819336, + "step": 3833, + "time_per_iteration": 2.7058162689208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107006, + "balance_loss_mlp": 1.05456233, + "epoch": 0.7375913813005002, + "flos": 506777306112.0, + "grad_norm": 0.08760745702981601, + "language_loss": 0.81515223, + "learning_rate": 0.0001699661401600589, + "loss": 0.82585281, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.15478516, + "step": 3834, + "time_per_iteration": 2.6158528327941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106653, + "balance_loss_mlp": 1.05206978, + "epoch": 0.7377837629857638, + "flos": 486183015936.0, + "grad_norm": 0.07963589333837205, + "language_loss": 0.78064704, + "learning_rate": 0.00016973217175665205, + "loss": 0.79131228, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.14453125, + "step": 3835, + "time_per_iteration": 2.6113386154174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005306, + "balance_loss_mlp": 0.99843931, + "epoch": 0.7379761446710273, + "flos": 1414693942272.0, + "grad_norm": 0.007558216579010849, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82171464, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.06884766, + "step": 3836, + "time_per_iteration": 4.930665493011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071082, + "balance_loss_mlp": 1.05599046, + "epoch": 0.7381685263562909, + "flos": 629737721856.0, + "grad_norm": 0.07838551299757777, + "language_loss": 0.84225684, + "learning_rate": 0.00016926461970465047, + "loss": 0.85296762, + 
"num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.15063477, + "step": 3837, + "time_per_iteration": 2.7925772666931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069523, + "balance_loss_mlp": 1.05480027, + "epoch": 0.7383609080415544, + "flos": 739224589824.0, + "grad_norm": 0.06636874651090781, + "language_loss": 0.84278762, + "learning_rate": 0.00016903103623757516, + "loss": 0.8534829, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.14709473, + "step": 3838, + "time_per_iteration": 3.077704429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_mlp": 1.0535419, + "epoch": 0.738553289726818, + "flos": 550206950400.0, + "grad_norm": 0.0837224725732271, + "language_loss": 0.79925728, + "learning_rate": 0.00016879758126404738, + "loss": 0.80994093, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.14819336, + "step": 3839, + "time_per_iteration": 2.7352871894836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072201, + "balance_loss_mlp": 1.05714512, + "epoch": 0.7387456714120816, + "flos": 910294640640.0, + "grad_norm": 0.07590823763574843, + "language_loss": 0.80038518, + "learning_rate": 0.00016856425487470216, + "loss": 0.81110722, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.15039062, + "step": 3840, + "time_per_iteration": 3.1132917404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070306, + "balance_loss_mlp": 1.05497539, + "epoch": 0.7389380530973452, + "flos": 852684807168.0, + "grad_norm": 0.0859256588835734, + "language_loss": 0.78885496, + "learning_rate": 0.00016833105716012486, + "loss": 0.79955798, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.15307617, + "step": 3841, + "time_per_iteration": 3.1532208919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067671, + "balance_loss_mlp": 1.05297232, + "epoch": 0.7391304347826086, + "flos": 
817026020352.0, + "grad_norm": 0.06792202219363284, + "language_loss": 0.84900254, + "learning_rate": 0.00016809798821085088, + "loss": 0.85967922, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.14660645, + "step": 3842, + "time_per_iteration": 3.01279354095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070418, + "balance_loss_mlp": 1.05537403, + "epoch": 0.7393228164678722, + "flos": 572819848704.0, + "grad_norm": 0.0638380683182141, + "language_loss": 0.88969815, + "learning_rate": 0.00016786504811736565, + "loss": 0.90040231, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.15014648, + "step": 3843, + "time_per_iteration": 2.6979498863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071016, + "balance_loss_mlp": 1.05594802, + "epoch": 0.7395151981531358, + "flos": 685237169664.0, + "grad_norm": 0.061553978302081376, + "language_loss": 0.82327366, + "learning_rate": 0.00016763223697010442, + "loss": 0.83398378, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.1505127, + "step": 3844, + "time_per_iteration": 2.9502556324005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067572, + "balance_loss_mlp": 1.05302894, + "epoch": 0.7397075798383994, + "flos": 556366662144.0, + "grad_norm": 0.056403600780772105, + "language_loss": 0.84155715, + "learning_rate": 0.00016739955485945256, + "loss": 0.85223293, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.14538574, + "step": 3845, + "time_per_iteration": 2.7162668704986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070346, + "balance_loss_mlp": 1.05569506, + "epoch": 0.739899961523663, + "flos": 546782400000.0, + "grad_norm": 0.081576664955192, + "language_loss": 0.86097336, + "learning_rate": 0.00016716700187574513, + "loss": 0.87167686, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.14648438, + "step": 3846, + "time_per_iteration": 2.6993191242218018 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073439, + "balance_loss_mlp": 1.0585022, + "epoch": 0.7400923432089265, + "flos": 609190419456.0, + "grad_norm": 0.06966979530394013, + "language_loss": 0.83732522, + "learning_rate": 0.0001669345781092675, + "loss": 0.84805954, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.14916992, + "step": 3847, + "time_per_iteration": 2.770108699798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075946, + "balance_loss_mlp": 1.06135464, + "epoch": 0.7402847248941901, + "flos": 591007518720.0, + "grad_norm": 0.06701111666950413, + "language_loss": 0.8687951, + "learning_rate": 0.0001667022836502546, + "loss": 0.87955451, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.14587402, + "step": 3848, + "time_per_iteration": 2.7933013439178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075686, + "balance_loss_mlp": 1.06097555, + "epoch": 0.7404771065794536, + "flos": 477369635328.0, + "grad_norm": 0.10971052102255037, + "language_loss": 0.8283127, + "learning_rate": 0.00016647011858889077, + "loss": 0.8390696, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.14709473, + "step": 3849, + "time_per_iteration": 2.5821588039398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107146, + "balance_loss_mlp": 1.05672526, + "epoch": 0.7406694882647172, + "flos": 496446755328.0, + "grad_norm": 0.08016384906089048, + "language_loss": 0.86103845, + "learning_rate": 0.00016623808301531056, + "loss": 0.87175304, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.1472168, + "step": 3850, + "time_per_iteration": 2.6669719219207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071974, + "balance_loss_mlp": 1.05720425, + "epoch": 0.7408618699499807, + "flos": 562205173248.0, + "grad_norm": 0.08205354684217782, + "language_loss": 0.79157412, + "learning_rate": 0.00016600617701959842, + "loss": 
0.80229384, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.14746094, + "step": 3851, + "time_per_iteration": 2.747596502304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006701, + "balance_loss_mlp": 1.00007319, + "epoch": 0.7410542516352443, + "flos": 1388228834304.0, + "grad_norm": 0.0072472756451880905, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79850513, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.06640625, + "step": 3852, + "time_per_iteration": 5.075153350830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074994, + "balance_loss_mlp": 1.06011701, + "epoch": 0.7412466333205079, + "flos": 669999776256.0, + "grad_norm": 0.07625474461693704, + "language_loss": 0.81200403, + "learning_rate": 0.00016554275412186315, + "loss": 0.82275391, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.14868164, + "step": 3853, + "time_per_iteration": 2.83164119720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069097, + "balance_loss_mlp": 1.05436301, + "epoch": 0.7414390150057715, + "flos": 489293706240.0, + "grad_norm": 0.08701956870254486, + "language_loss": 0.80909944, + "learning_rate": 0.0001653112373997568, + "loss": 0.81979048, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.1472168, + "step": 3854, + "time_per_iteration": 2.6991629600524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072539, + "balance_loss_mlp": 1.057459, + "epoch": 0.7416313966910351, + "flos": 599393613312.0, + "grad_norm": 0.08599035855505702, + "language_loss": 0.7489301, + "learning_rate": 0.0001650798506153517, + "loss": 0.75965548, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.1505127, + "step": 3855, + "time_per_iteration": 2.6856653690338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070697, + "balance_loss_mlp": 1.05554581, + "epoch": 0.7418237783762985, + 
"flos": 542539980288.0, + "grad_norm": 0.07804718077998271, + "language_loss": 0.84300339, + "learning_rate": 0.00016484859385848023, + "loss": 0.85371041, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.15124512, + "step": 3856, + "time_per_iteration": 2.637263059616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065824, + "balance_loss_mlp": 1.0510422, + "epoch": 0.7420161600615621, + "flos": 544136071680.0, + "grad_norm": 0.07531615972312422, + "language_loss": 0.77476895, + "learning_rate": 0.0001646174672189243, + "loss": 0.78542721, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.14770508, + "step": 3857, + "time_per_iteration": 2.6742300987243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072034, + "balance_loss_mlp": 1.0568707, + "epoch": 0.7422085417468257, + "flos": 527178875904.0, + "grad_norm": 0.07664417369096119, + "language_loss": 0.80383694, + "learning_rate": 0.00016438647078641488, + "loss": 0.81455731, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.15148926, + "step": 3858, + "time_per_iteration": 2.6050353050231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070582, + "balance_loss_mlp": 1.05539477, + "epoch": 0.7424009234320893, + "flos": 508674774528.0, + "grad_norm": 0.07203197801736921, + "language_loss": 0.83144253, + "learning_rate": 0.00016415560465063344, + "loss": 0.84214836, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.15161133, + "step": 3859, + "time_per_iteration": 2.7623064517974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072258, + "balance_loss_mlp": 1.05751181, + "epoch": 0.7425933051173528, + "flos": 512598564864.0, + "grad_norm": 0.06874041780278002, + "language_loss": 0.79038745, + "learning_rate": 0.0001639248689012095, + "loss": 0.80111003, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.14733887, + "step": 3860, + "time_per_iteration": 
2.5939531326293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070268, + "balance_loss_mlp": 1.05512857, + "epoch": 0.7427856868026164, + "flos": 458302053888.0, + "grad_norm": 0.07350694530214436, + "language_loss": 0.87617624, + "learning_rate": 0.00016369426362772271, + "loss": 0.88687891, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.15136719, + "step": 3861, + "time_per_iteration": 2.8275485038757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072029, + "balance_loss_mlp": 1.057199, + "epoch": 0.74297806848788, + "flos": 605019580416.0, + "grad_norm": 0.0620300979873649, + "language_loss": 0.80084789, + "learning_rate": 0.00016346378891970233, + "loss": 0.8115682, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.14807129, + "step": 3862, + "time_per_iteration": 2.903923988342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078214, + "balance_loss_mlp": 1.06349134, + "epoch": 0.7431704501731435, + "flos": 891390044160.0, + "grad_norm": 0.08373807590972174, + "language_loss": 0.81505513, + "learning_rate": 0.00016323344486662633, + "loss": 0.82583725, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.14697266, + "step": 3863, + "time_per_iteration": 3.331378221511841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075731, + "balance_loss_mlp": 1.06079412, + "epoch": 0.7433628318584071, + "flos": 592163841024.0, + "grad_norm": 0.06625022737773377, + "language_loss": 0.78612608, + "learning_rate": 0.00016300323155792247, + "loss": 0.79688334, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.14941406, + "step": 3864, + "time_per_iteration": 2.961841583251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070996, + "balance_loss_mlp": 1.05604696, + "epoch": 0.7435552135436706, + "flos": 477154520064.0, + "grad_norm": 0.06667559747166675, + "language_loss": 0.88657868, + "learning_rate": 
0.00016277314908296687, + "loss": 0.89728856, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.14929199, + "step": 3865, + "time_per_iteration": 2.6347100734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072634, + "balance_loss_mlp": 1.05754232, + "epoch": 0.7437475952289342, + "flos": 673184618496.0, + "grad_norm": 0.09401519790686412, + "language_loss": 0.76145202, + "learning_rate": 0.00016254319753108604, + "loss": 0.77217835, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.15075684, + "step": 3866, + "time_per_iteration": 2.87519907951355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069984, + "balance_loss_mlp": 1.05523825, + "epoch": 0.7439399769141978, + "flos": 770428786176.0, + "grad_norm": 0.07662525510674034, + "language_loss": 0.76897246, + "learning_rate": 0.00016231337699155492, + "loss": 0.77967227, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.14733887, + "step": 3867, + "time_per_iteration": 2.983419418334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074113, + "balance_loss_mlp": 1.05875885, + "epoch": 0.7441323585994614, + "flos": 647777088000.0, + "grad_norm": 0.06858824495499428, + "language_loss": 0.78350103, + "learning_rate": 0.0001620836875535977, + "loss": 0.79424214, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.15332031, + "step": 3868, + "time_per_iteration": 2.917475461959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074549, + "balance_loss_mlp": 1.05954003, + "epoch": 0.7443247402847248, + "flos": 565372763136.0, + "grad_norm": 0.07353784330896508, + "language_loss": 0.80791712, + "learning_rate": 0.00016185412930638766, + "loss": 0.81866264, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.14990234, + "step": 3869, + "time_per_iteration": 2.8665554523468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073848, + "balance_loss_mlp": 
1.05905402, + "epoch": 0.7445171219699884, + "flos": 578529879552.0, + "grad_norm": 0.07383455824846064, + "language_loss": 0.82674599, + "learning_rate": 0.00016162470233904765, + "loss": 0.83748442, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.14782715, + "step": 3870, + "time_per_iteration": 2.7601962089538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074818, + "balance_loss_mlp": 1.05997705, + "epoch": 0.744709503655252, + "flos": 618875997696.0, + "grad_norm": 0.07839397285168428, + "language_loss": 0.82130003, + "learning_rate": 0.00016139540674064856, + "loss": 0.8320483, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.14819336, + "step": 3871, + "time_per_iteration": 2.733957290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076556, + "balance_loss_mlp": 1.0619173, + "epoch": 0.7449018853405156, + "flos": 528619322880.0, + "grad_norm": 0.07038188530007786, + "language_loss": 0.77430081, + "learning_rate": 0.00016116624260021113, + "loss": 0.78506637, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.14624023, + "step": 3872, + "time_per_iteration": 2.800231456756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071128, + "balance_loss_mlp": 1.05622649, + "epoch": 0.7450942670257792, + "flos": 433314842112.0, + "grad_norm": 0.08400374472729004, + "language_loss": 0.83973575, + "learning_rate": 0.0001609372100067046, + "loss": 0.85044706, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.14892578, + "step": 3873, + "time_per_iteration": 2.5605225563049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071204, + "balance_loss_mlp": 1.05576611, + "epoch": 0.7452866487110427, + "flos": 696882258432.0, + "grad_norm": 0.09625530023155114, + "language_loss": 0.84883142, + "learning_rate": 0.0001607083090490475, + "loss": 0.85954344, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.1541748, + "step": 
3874, + "time_per_iteration": 2.9394900798797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071613, + "balance_loss_mlp": 1.05664086, + "epoch": 0.7454790303963063, + "flos": 512210552832.0, + "grad_norm": 0.08058298210668029, + "language_loss": 0.79820347, + "learning_rate": 0.00016047953981610714, + "loss": 0.80891967, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.14953613, + "step": 3875, + "time_per_iteration": 2.7139272689819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021273, + "balance_loss_mlp": 1.01435852, + "epoch": 0.7456714120815698, + "flos": 1325949668352.0, + "grad_norm": 0.011941727483725444, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80750912, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.06933594, + "step": 3876, + "time_per_iteration": 4.9671149253845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076145, + "balance_loss_mlp": 1.06137538, + "epoch": 0.7458637937668334, + "flos": 721711627776.0, + "grad_norm": 0.07051283090717735, + "language_loss": 0.80756205, + "learning_rate": 0.0001600223968795889, + "loss": 0.81832355, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.14758301, + "step": 3877, + "time_per_iteration": 2.9416282176971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020704, + "balance_loss_mlp": 1.01379037, + "epoch": 0.746056175452097, + "flos": 1501580395008.0, + "grad_norm": 0.011482501801392642, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76716781, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.06933594, + "step": 3878, + "time_per_iteration": 4.898989677429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075351, + "balance_loss_mlp": 1.06042576, + "epoch": 0.7462485571373605, + "flos": 520245711360.0, + "grad_norm": 0.0885851208026398, + "language_loss": 
0.81747985, + "learning_rate": 0.00015956578190706483, + "loss": 0.82823336, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.14904785, + "step": 3879, + "time_per_iteration": 2.6805362701416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066599, + "balance_loss_mlp": 1.05175781, + "epoch": 0.7464409388226241, + "flos": 481206790656.0, + "grad_norm": 0.07723337503848805, + "language_loss": 0.75796139, + "learning_rate": 0.00015933767262892468, + "loss": 0.76862741, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.14831543, + "step": 3880, + "time_per_iteration": 2.7313079833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107623, + "balance_loss_mlp": 1.06153107, + "epoch": 0.7466333205078877, + "flos": 486761177088.0, + "grad_norm": 0.08607243998977276, + "language_loss": 0.82115054, + "learning_rate": 0.00015910969560762927, + "loss": 0.83191288, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.14685059, + "step": 3881, + "time_per_iteration": 2.633983612060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074372, + "balance_loss_mlp": 1.05975699, + "epoch": 0.7468257021931513, + "flos": 611293091328.0, + "grad_norm": 0.07136699104926861, + "language_loss": 0.83270466, + "learning_rate": 0.00015888185093168727, + "loss": 0.8434484, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.14587402, + "step": 3882, + "time_per_iteration": 2.7481329441070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107382, + "balance_loss_mlp": 1.05850148, + "epoch": 0.7470180838784147, + "flos": 533459727360.0, + "grad_norm": 0.07343335643807868, + "language_loss": 0.81235325, + "learning_rate": 0.00015865413868955581, + "loss": 0.82309145, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.15319824, + "step": 3883, + "time_per_iteration": 2.651353120803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064128, 
+ "balance_loss_mlp": 1.04927468, + "epoch": 0.7472104655636783, + "flos": 739338388992.0, + "grad_norm": 0.07434119530275363, + "language_loss": 0.82377172, + "learning_rate": 0.00015842655896964054, + "loss": 0.83441293, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.14831543, + "step": 3884, + "time_per_iteration": 3.050145149230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071848, + "balance_loss_mlp": 1.05682731, + "epoch": 0.7474028472489419, + "flos": 640305409536.0, + "grad_norm": 0.06949199138359925, + "language_loss": 0.73357499, + "learning_rate": 0.00015819911186029567, + "loss": 0.74429345, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.14990234, + "step": 3885, + "time_per_iteration": 2.8004651069641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073306, + "balance_loss_mlp": 1.05846465, + "epoch": 0.7475952289342055, + "flos": 590249120256.0, + "grad_norm": 0.07642701531837649, + "language_loss": 0.86417222, + "learning_rate": 0.00015797179744982443, + "loss": 0.87490523, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.14831543, + "step": 3886, + "time_per_iteration": 2.7258265018463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071859, + "balance_loss_mlp": 1.05695772, + "epoch": 0.7477876106194691, + "flos": 488191712256.0, + "grad_norm": 0.06842586328042619, + "language_loss": 0.78899908, + "learning_rate": 0.00015774461582647765, + "loss": 0.79971766, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.14868164, + "step": 3887, + "time_per_iteration": 2.6812551021575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067811, + "balance_loss_mlp": 1.0530405, + "epoch": 0.7479799923047326, + "flos": 554733494784.0, + "grad_norm": 0.07125585996553076, + "language_loss": 0.81060201, + "learning_rate": 0.00015751756707845505, + "loss": 0.82128012, + "num_input_tokens_seen": 322472512, + 
"router_z_loss_mlp": 0.14746094, + "step": 3888, + "time_per_iteration": 2.6297996044158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076062, + "balance_loss_mlp": 1.06092191, + "epoch": 0.7481723739899961, + "flos": 767387105280.0, + "grad_norm": 0.06726063528868273, + "language_loss": 0.87855798, + "learning_rate": 0.00015729065129390502, + "loss": 0.88931859, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.15112305, + "step": 3889, + "time_per_iteration": 3.0159242153167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074487, + "balance_loss_mlp": 1.0599674, + "epoch": 0.7483647556752597, + "flos": 496172542464.0, + "grad_norm": 0.10838691697932842, + "language_loss": 0.8232426, + "learning_rate": 0.0001570638685609241, + "loss": 0.83398747, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.14501953, + "step": 3890, + "time_per_iteration": 2.6125991344451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107077, + "balance_loss_mlp": 1.05557132, + "epoch": 0.7485571373605233, + "flos": 472850431488.0, + "grad_norm": 0.08016904552002256, + "language_loss": 0.80514675, + "learning_rate": 0.00015683721896755693, + "loss": 0.81585443, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.1517334, + "step": 3891, + "time_per_iteration": 2.5569143295288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013078, + "balance_loss_mlp": 1.00649726, + "epoch": 0.7487495190457868, + "flos": 1554473161728.0, + "grad_norm": 0.007010152707011992, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83223569, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.06591797, + "step": 3892, + "time_per_iteration": 4.943824052810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071888, + "balance_loss_mlp": 1.05717778, + "epoch": 0.7489419007310504, + "flos": 581845773312.0, + "grad_norm": 
0.07161004761849712, + "language_loss": 0.85438764, + "learning_rate": 0.00015638431955158528, + "loss": 0.86510646, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.14697266, + "step": 3893, + "time_per_iteration": 2.7677009105682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072508, + "balance_loss_mlp": 1.057881, + "epoch": 0.749134282416314, + "flos": 567576751104.0, + "grad_norm": 0.06398748480098922, + "language_loss": 0.80855525, + "learning_rate": 0.00015615806990481186, + "loss": 0.81928039, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.14611816, + "step": 3894, + "time_per_iteration": 2.7350878715515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074761, + "balance_loss_mlp": 1.05946612, + "epoch": 0.7493266641015776, + "flos": 533061803520.0, + "grad_norm": 0.06232267001853924, + "language_loss": 0.84572965, + "learning_rate": 0.00015593195374931452, + "loss": 0.85647732, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.15270996, + "step": 3895, + "time_per_iteration": 2.7310590744018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070156, + "balance_loss_mlp": 1.05569601, + "epoch": 0.7495190457868411, + "flos": 523613362176.0, + "grad_norm": 0.10454645503772597, + "language_loss": 0.80334634, + "learning_rate": 0.00015570597117287922, + "loss": 0.81404787, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.14453125, + "step": 3896, + "time_per_iteration": 2.6727263927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069845, + "balance_loss_mlp": 1.05496776, + "epoch": 0.7497114274721046, + "flos": 514187315712.0, + "grad_norm": 0.08797347720338106, + "language_loss": 0.77618623, + "learning_rate": 0.0001554801222632406, + "loss": 0.78688467, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.14868164, + "step": 3897, + "time_per_iteration": 2.625335931777954 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01073048, + "balance_loss_mlp": 1.05813491, + "epoch": 0.7499038091573682, + "flos": 495006308352.0, + "grad_norm": 0.06959868179345496, + "language_loss": 0.85080492, + "learning_rate": 0.00015525440710808052, + "loss": 0.86153543, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.14892578, + "step": 3898, + "time_per_iteration": 2.613952875137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068734, + "balance_loss_mlp": 1.05366588, + "epoch": 0.7500961908426318, + "flos": 737658233856.0, + "grad_norm": 0.08867238273395864, + "language_loss": 0.77680039, + "learning_rate": 0.00015502882579502953, + "loss": 0.78748775, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.1505127, + "step": 3899, + "time_per_iteration": 2.960392951965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068744, + "balance_loss_mlp": 1.05395043, + "epoch": 0.7502885725278954, + "flos": 533400256512.0, + "grad_norm": 0.06630940736811984, + "language_loss": 0.84808308, + "learning_rate": 0.00015480337841166592, + "loss": 0.85877049, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.14770508, + "step": 3900, + "time_per_iteration": 2.732886552810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074258, + "balance_loss_mlp": 1.06013203, + "epoch": 0.7504809542131589, + "flos": 589324792320.0, + "grad_norm": 0.12261327498313113, + "language_loss": 0.82749909, + "learning_rate": 0.00015457806504551647, + "loss": 0.8382417, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.14135742, + "step": 3901, + "time_per_iteration": 2.8467769622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072304, + "balance_loss_mlp": 1.05728328, + "epoch": 0.7506733358984224, + "flos": 511550899200.0, + "grad_norm": 0.06753844274961214, + "language_loss": 0.77791429, + "learning_rate": 0.0001543528857840554, + "loss": 0.78863734, + 
"num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.14990234, + "step": 3902, + "time_per_iteration": 2.6523211002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071569, + "balance_loss_mlp": 1.05731118, + "epoch": 0.750865717583686, + "flos": 539268503040.0, + "grad_norm": 0.09602264980762555, + "language_loss": 0.80487525, + "learning_rate": 0.000154127840714705, + "loss": 0.81559098, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.14257812, + "step": 3903, + "time_per_iteration": 2.8009955883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068794, + "balance_loss_mlp": 1.05425048, + "epoch": 0.7510580992689496, + "flos": 476578930176.0, + "grad_norm": 0.08455294978842176, + "language_loss": 0.82418245, + "learning_rate": 0.00015390292992483557, + "loss": 0.8348704, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.1451416, + "step": 3904, + "time_per_iteration": 2.5455572605133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071227, + "balance_loss_mlp": 1.05671942, + "epoch": 0.7512504809542132, + "flos": 579043800576.0, + "grad_norm": 0.08735450092332898, + "language_loss": 0.84165967, + "learning_rate": 0.00015367815350176523, + "loss": 0.85237193, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.14501953, + "step": 3905, + "time_per_iteration": 2.7836532592773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067778, + "balance_loss_mlp": 1.05274606, + "epoch": 0.7514428626394767, + "flos": 418660379136.0, + "grad_norm": 0.07770341183537, + "language_loss": 0.82813609, + "learning_rate": 0.00015345351153275987, + "loss": 0.83881384, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.15002441, + "step": 3906, + "time_per_iteration": 2.5773417949676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073295, + "balance_loss_mlp": 1.05825067, + "epoch": 0.7516352443247403, + "flos": 
641039215104.0, + "grad_norm": 0.06258787162872337, + "language_loss": 0.80409896, + "learning_rate": 0.00015322900410503332, + "loss": 0.81483191, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.15026855, + "step": 3907, + "time_per_iteration": 2.8312478065490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068679, + "balance_loss_mlp": 1.05444527, + "epoch": 0.7518276260100039, + "flos": 580998168576.0, + "grad_norm": 0.07094809333059562, + "language_loss": 0.77488625, + "learning_rate": 0.00015300463130574703, + "loss": 0.78557301, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.14245605, + "step": 3908, + "time_per_iteration": 4.43429160118103 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073575, + "balance_loss_mlp": 1.05884063, + "epoch": 0.7520200076952674, + "flos": 687342412800.0, + "grad_norm": 0.07651069808134531, + "language_loss": 0.81860089, + "learning_rate": 0.00015278039322201033, + "loss": 0.82933658, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.1472168, + "step": 3909, + "time_per_iteration": 2.999046564102173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069479, + "balance_loss_mlp": 1.05531669, + "epoch": 0.7522123893805309, + "flos": 486439976448.0, + "grad_norm": 0.09637101763600625, + "language_loss": 0.79630423, + "learning_rate": 0.00015255628994088004, + "loss": 0.80699903, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.14160156, + "step": 3910, + "time_per_iteration": 2.5875840187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075904, + "balance_loss_mlp": 1.06090784, + "epoch": 0.7524047710657945, + "flos": 818982586368.0, + "grad_norm": 0.10609317146357068, + "language_loss": 0.75265759, + "learning_rate": 0.00015233232154936082, + "loss": 0.76341665, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.14978027, + "step": 3911, + "time_per_iteration": 3.2563164234161377 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073648, + "balance_loss_mlp": 1.05859196, + "epoch": 0.7525971527510581, + "flos": 699508763136.0, + "grad_norm": 0.11854285995885969, + "language_loss": 0.76211643, + "learning_rate": 0.0001521084881344048, + "loss": 0.7728529, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.15039062, + "step": 3912, + "time_per_iteration": 2.867192029953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107173, + "balance_loss_mlp": 1.05700815, + "epoch": 0.7527895344363217, + "flos": 633787421184.0, + "grad_norm": 0.0664126315840823, + "language_loss": 0.86507452, + "learning_rate": 0.00015188478978291208, + "loss": 0.87579179, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.14697266, + "step": 3913, + "time_per_iteration": 2.80972957611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071284, + "balance_loss_mlp": 1.05657363, + "epoch": 0.7529819161215853, + "flos": 562830322176.0, + "grad_norm": 0.08394603234641039, + "language_loss": 0.86425006, + "learning_rate": 0.00015166122658173014, + "loss": 0.87496293, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.14697266, + "step": 3914, + "time_per_iteration": 2.8178954124450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106559, + "balance_loss_mlp": 1.05062914, + "epoch": 0.7531742978068487, + "flos": 690665647104.0, + "grad_norm": 0.08202440069993752, + "language_loss": 0.88477957, + "learning_rate": 0.00015143779861765332, + "loss": 0.89543545, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.14953613, + "step": 3915, + "time_per_iteration": 2.920933961868286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066546, + "balance_loss_mlp": 1.05214548, + "epoch": 0.7533666794921123, + "flos": 681101208576.0, + "grad_norm": 0.09013491853638725, + "language_loss": 0.81357694, + "learning_rate": 0.00015121450597742458, + "loss": 
0.82424241, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.14379883, + "step": 3916, + "time_per_iteration": 2.858567714691162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069931, + "balance_loss_mlp": 1.05475569, + "epoch": 0.7535590611773759, + "flos": 623669414400.0, + "grad_norm": 0.07911049580666238, + "language_loss": 0.78366303, + "learning_rate": 0.00015099134874773369, + "loss": 0.79436231, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.15148926, + "step": 3917, + "time_per_iteration": 2.7468197345733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064775, + "balance_loss_mlp": 1.04978991, + "epoch": 0.7537514428626395, + "flos": 519427842048.0, + "grad_norm": 0.06092774973766905, + "language_loss": 0.79940081, + "learning_rate": 0.00015076832701521793, + "loss": 0.81004852, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.14953613, + "step": 3918, + "time_per_iteration": 2.7390952110290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067305, + "balance_loss_mlp": 1.05246365, + "epoch": 0.753943824547903, + "flos": 723653512704.0, + "grad_norm": 0.0783583526919487, + "language_loss": 0.81940138, + "learning_rate": 0.000150545440866462, + "loss": 0.83007443, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.14831543, + "step": 3919, + "time_per_iteration": 2.999077558517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074124, + "balance_loss_mlp": 1.05919874, + "epoch": 0.7541362062331666, + "flos": 437547350016.0, + "grad_norm": 0.13926190935015714, + "language_loss": 0.78544766, + "learning_rate": 0.000150322690387998, + "loss": 0.79618883, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.14904785, + "step": 3920, + "time_per_iteration": 2.5101473331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068636, + "balance_loss_mlp": 1.05396128, + "epoch": 0.7543285879184302, + 
"flos": 565274018304.0, + "grad_norm": 0.07965418146183906, + "language_loss": 0.75046188, + "learning_rate": 0.00015010007566630535, + "loss": 0.76114827, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.14648438, + "step": 3921, + "time_per_iteration": 2.741964101791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067461, + "balance_loss_mlp": 1.05235684, + "epoch": 0.7545209696036937, + "flos": 521036416512.0, + "grad_norm": 0.11708553927616697, + "language_loss": 0.81529438, + "learning_rate": 0.00014987759678781077, + "loss": 0.82596898, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.15087891, + "step": 3922, + "time_per_iteration": 2.648132562637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066406, + "balance_loss_mlp": 1.05142117, + "epoch": 0.7547133512889573, + "flos": 616066684416.0, + "grad_norm": 0.07358029420700156, + "language_loss": 0.82236576, + "learning_rate": 0.00014965525383888795, + "loss": 0.83302975, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.1496582, + "step": 3923, + "time_per_iteration": 2.8054702281951904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064259, + "balance_loss_mlp": 1.04944098, + "epoch": 0.7549057329742208, + "flos": 750845085696.0, + "grad_norm": 0.064367666918871, + "language_loss": 0.72265434, + "learning_rate": 0.00014943304690585851, + "loss": 0.73329699, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.14794922, + "step": 3924, + "time_per_iteration": 2.9498682022094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070108, + "balance_loss_mlp": 1.0550642, + "epoch": 0.7550981146594844, + "flos": 514444276224.0, + "grad_norm": 0.09368808464599959, + "language_loss": 0.78766346, + "learning_rate": 0.0001492109760749908, + "loss": 0.79836458, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.15026855, + "step": 3925, + "time_per_iteration": 
2.6306443214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070609, + "balance_loss_mlp": 1.05580342, + "epoch": 0.755290496344748, + "flos": 522009930240.0, + "grad_norm": 0.06789463297635422, + "language_loss": 0.79637897, + "learning_rate": 0.00014898904143250002, + "loss": 0.8070851, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.14770508, + "step": 3926, + "time_per_iteration": 2.675294876098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021023, + "balance_loss_mlp": 1.01463342, + "epoch": 0.7554828780300116, + "flos": 1414615021056.0, + "grad_norm": 0.012183216489225542, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76776218, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.06396484, + "step": 3927, + "time_per_iteration": 4.929020166397095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077252, + "balance_loss_mlp": 1.06279135, + "epoch": 0.7556752597152752, + "flos": 556937482752.0, + "grad_norm": 0.08141405861107236, + "language_loss": 0.79880834, + "learning_rate": 0.0001485455810572474, + "loss": 0.80958086, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.14453125, + "step": 3928, + "time_per_iteration": 2.6965065002441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081004, + "balance_loss_mlp": 1.06655574, + "epoch": 0.7558676414005386, + "flos": 563638279680.0, + "grad_norm": 0.061395348363909676, + "language_loss": 0.84046453, + "learning_rate": 0.00014832405549665236, + "loss": 0.85127461, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.14440918, + "step": 3929, + "time_per_iteration": 2.7616498470306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108063, + "balance_loss_mlp": 1.06583571, + "epoch": 0.7560600230858022, + "flos": 561377392128.0, + "grad_norm": 0.07976690960726483, + "language_loss": 0.7883532, + "learning_rate": 
0.00014810266646876746, + "loss": 0.79915947, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.14794922, + "step": 3930, + "time_per_iteration": 2.778254747390747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080018, + "balance_loss_mlp": 1.06545067, + "epoch": 0.7562524047710658, + "flos": 719576649216.0, + "grad_norm": 0.08838808443584828, + "language_loss": 0.7752986, + "learning_rate": 0.00014788141405954364, + "loss": 0.78609884, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.14538574, + "step": 3931, + "time_per_iteration": 3.053114891052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078954, + "balance_loss_mlp": 1.06433892, + "epoch": 0.7564447864563294, + "flos": 543347937792.0, + "grad_norm": 0.08282527395338529, + "language_loss": 0.85036647, + "learning_rate": 0.00014766029835487865, + "loss": 0.86115611, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.14599609, + "step": 3932, + "time_per_iteration": 2.7713563442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088765, + "balance_loss_mlp": 1.07401931, + "epoch": 0.7566371681415929, + "flos": 725805743616.0, + "grad_norm": 0.09534253325991678, + "language_loss": 0.79310846, + "learning_rate": 0.0001474393194406173, + "loss": 0.80399615, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.14733887, + "step": 3933, + "time_per_iteration": 2.9569146633148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081686, + "balance_loss_mlp": 1.06690359, + "epoch": 0.7568295498268565, + "flos": 576580280832.0, + "grad_norm": 0.06403839142600674, + "language_loss": 0.79947335, + "learning_rate": 0.00014721847740255112, + "loss": 0.81029022, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.14758301, + "step": 3934, + "time_per_iteration": 2.835782766342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018298, + "balance_loss_mlp": 
1.01190841, + "epoch": 0.75702193151212, + "flos": 1520059903488.0, + "grad_norm": 0.010361786732504343, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74930221, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.06396484, + "step": 3935, + "time_per_iteration": 4.648789167404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079127, + "balance_loss_mlp": 1.06448829, + "epoch": 0.7572143131973836, + "flos": 525471556608.0, + "grad_norm": 0.08867148183843263, + "language_loss": 0.78082466, + "learning_rate": 0.00014677720429790526, + "loss": 0.79161596, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.1463623, + "step": 3936, + "time_per_iteration": 2.620413064956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073911, + "balance_loss_mlp": 1.05959392, + "epoch": 0.7574066948826472, + "flos": 550738123776.0, + "grad_norm": 0.05592268057464008, + "language_loss": 0.84353757, + "learning_rate": 0.0001465567734026429, + "loss": 0.85427672, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.14306641, + "step": 3937, + "time_per_iteration": 2.725707769393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075578, + "balance_loss_mlp": 1.06070113, + "epoch": 0.7575990765679107, + "flos": 395899176960.0, + "grad_norm": 0.08941460340947252, + "language_loss": 0.82362223, + "learning_rate": 0.00014633647972621034, + "loss": 0.834378, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.14868164, + "step": 3938, + "time_per_iteration": 2.4839630126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073907, + "balance_loss_mlp": 1.05917263, + "epoch": 0.7577914582531743, + "flos": 585030615552.0, + "grad_norm": 0.06669605761909986, + "language_loss": 0.8624711, + "learning_rate": 0.00014611632335413354, + "loss": 0.87321013, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.14709473, + "step": 
3939, + "time_per_iteration": 2.791856527328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071326, + "balance_loss_mlp": 1.05647278, + "epoch": 0.7579838399384379, + "flos": 820979172864.0, + "grad_norm": 0.06597748597273165, + "language_loss": 0.82603717, + "learning_rate": 0.00014589630437188456, + "loss": 0.83675039, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.14831543, + "step": 3940, + "time_per_iteration": 3.1954329013824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081609, + "balance_loss_mlp": 1.06717253, + "epoch": 0.7581762216237015, + "flos": 443892441600.0, + "grad_norm": 0.08139847599649805, + "language_loss": 0.78537852, + "learning_rate": 0.00014567642286488253, + "loss": 0.79619455, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.14428711, + "step": 3941, + "time_per_iteration": 2.5560035705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074149, + "balance_loss_mlp": 1.05904555, + "epoch": 0.7583686033089649, + "flos": 540886989312.0, + "grad_norm": 0.08568953404097215, + "language_loss": 0.79163635, + "learning_rate": 0.00014545667891849258, + "loss": 0.80237788, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.15100098, + "step": 3942, + "time_per_iteration": 2.6567327976226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073928, + "balance_loss_mlp": 1.05890775, + "epoch": 0.7585609849942285, + "flos": 522588091392.0, + "grad_norm": 0.08481557046486428, + "language_loss": 0.82241571, + "learning_rate": 0.00014523707261802733, + "loss": 0.83315504, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.14990234, + "step": 3943, + "time_per_iteration": 2.6527955532073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107533, + "balance_loss_mlp": 1.0604291, + "epoch": 0.7587533666794921, + "flos": 541860503040.0, + "grad_norm": 0.07206762548440185, + "language_loss": 0.81135172, 
+ "learning_rate": 0.00014501760404874527, + "loss": 0.82210505, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.14868164, + "step": 3944, + "time_per_iteration": 2.750162124633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072936, + "balance_loss_mlp": 1.05839252, + "epoch": 0.7589457483647557, + "flos": 606408270336.0, + "grad_norm": 0.07713855070396991, + "language_loss": 0.85369128, + "learning_rate": 0.00014479827329585176, + "loss": 0.86442065, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.14538574, + "step": 3945, + "time_per_iteration": 2.755915641784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069268, + "balance_loss_mlp": 1.05458164, + "epoch": 0.7591381300500193, + "flos": 555106452480.0, + "grad_norm": 0.06696753824594462, + "language_loss": 0.84734225, + "learning_rate": 0.00014457908044449846, + "loss": 0.85803485, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.14685059, + "step": 3946, + "time_per_iteration": 2.794527292251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066827, + "balance_loss_mlp": 1.05199742, + "epoch": 0.7593305117352828, + "flos": 529681669632.0, + "grad_norm": 0.22019165201908963, + "language_loss": 0.8300361, + "learning_rate": 0.00014436002557978371, + "loss": 0.84070432, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.14794922, + "step": 3947, + "time_per_iteration": 2.7884273529052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011645, + "balance_loss_mlp": 1.00530314, + "epoch": 0.7595228934205464, + "flos": 1502798759424.0, + "grad_norm": 0.007510047355142999, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77654791, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.06347656, + "step": 3948, + "time_per_iteration": 4.91646671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071151, + 
"balance_loss_mlp": 1.05669069, + "epoch": 0.7597152751058099, + "flos": 455525047296.0, + "grad_norm": 0.06244939704933972, + "language_loss": 0.79716647, + "learning_rate": 0.0001439223301503945, + "loss": 0.80787796, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.14440918, + "step": 3949, + "time_per_iteration": 2.5492866039276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071573, + "balance_loss_mlp": 1.05702949, + "epoch": 0.7599076567910735, + "flos": 685466966016.0, + "grad_norm": 0.07710584125137034, + "language_loss": 0.76199448, + "learning_rate": 0.00014370368975564834, + "loss": 0.7727102, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.14526367, + "step": 3950, + "time_per_iteration": 2.9614758491516113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107362, + "balance_loss_mlp": 1.05911183, + "epoch": 0.760100038476337, + "flos": 532372414464.0, + "grad_norm": 0.08081333921040441, + "language_loss": 0.83668613, + "learning_rate": 0.00014348518768739766, + "loss": 0.84742236, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.14477539, + "step": 3951, + "time_per_iteration": 2.7431232929229736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013923, + "balance_loss_mlp": 1.00758147, + "epoch": 0.7602924201616006, + "flos": 1471742866944.0, + "grad_norm": 0.008820182172653682, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77741963, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.06347656, + "step": 3952, + "time_per_iteration": 4.910484790802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066227, + "balance_loss_mlp": 1.05181456, + "epoch": 0.7604848018468642, + "flos": 774631558656.0, + "grad_norm": 0.06688586461179376, + "language_loss": 0.86683798, + "learning_rate": 0.00014304859886964867, + "loss": 0.8775003, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 
0.14416504, + "step": 3953, + "time_per_iteration": 3.0533196926116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106398, + "balance_loss_mlp": 1.04932904, + "epoch": 0.7606771835321278, + "flos": 558185209344.0, + "grad_norm": 0.08095687676459093, + "language_loss": 0.83446729, + "learning_rate": 0.00014283051228964878, + "loss": 0.84510708, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.14624023, + "step": 3954, + "time_per_iteration": 2.6953165531158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063032, + "balance_loss_mlp": 1.04855967, + "epoch": 0.7608695652173914, + "flos": 525397404672.0, + "grad_norm": 0.07612254012233202, + "language_loss": 0.82667398, + "learning_rate": 0.00014261256437514197, + "loss": 0.83730423, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.14477539, + "step": 3955, + "time_per_iteration": 2.635387897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064275, + "balance_loss_mlp": 1.05008912, + "epoch": 0.7610619469026548, + "flos": 615038842368.0, + "grad_norm": 0.07371649985569284, + "language_loss": 0.82440156, + "learning_rate": 0.0001423947552107428, + "loss": 0.83504432, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.14196777, + "step": 3956, + "time_per_iteration": 2.737157106399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066435, + "balance_loss_mlp": 1.05186772, + "epoch": 0.7612543285879184, + "flos": 863356382208.0, + "grad_norm": 0.0738552534680633, + "language_loss": 0.76961863, + "learning_rate": 0.00014217708488101243, + "loss": 0.78028303, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.14575195, + "step": 3957, + "time_per_iteration": 3.0698153972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071692, + "balance_loss_mlp": 1.05686283, + "epoch": 0.761446710273182, + "flos": 553658664960.0, + "grad_norm": 0.08088514343400555, + 
"language_loss": 0.77329475, + "learning_rate": 0.0001419595534704579, + "loss": 0.78401166, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.14807129, + "step": 3958, + "time_per_iteration": 2.714460611343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072652, + "balance_loss_mlp": 1.05804873, + "epoch": 0.7616390919584456, + "flos": 467350373376.0, + "grad_norm": 0.07036888376906092, + "language_loss": 0.81419134, + "learning_rate": 0.00014174216106353237, + "loss": 0.82491785, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.14575195, + "step": 3959, + "time_per_iteration": 2.6297383308410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071387, + "balance_loss_mlp": 1.05678439, + "epoch": 0.7618314736437091, + "flos": 498430858752.0, + "grad_norm": 0.08691180014870155, + "language_loss": 0.76360267, + "learning_rate": 0.00014152490774463512, + "loss": 0.77431655, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.14599609, + "step": 3960, + "time_per_iteration": 2.650489568710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068154, + "balance_loss_mlp": 1.05350327, + "epoch": 0.7620238553289727, + "flos": 434545316352.0, + "grad_norm": 0.10467283388045306, + "language_loss": 0.87295103, + "learning_rate": 0.00014130779359811135, + "loss": 0.88363254, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.14624023, + "step": 3961, + "time_per_iteration": 2.470933437347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069904, + "balance_loss_mlp": 1.05552769, + "epoch": 0.7622162370142362, + "flos": 664277262336.0, + "grad_norm": 0.06695122847081907, + "language_loss": 0.85981679, + "learning_rate": 0.0001410908187082521, + "loss": 0.87051582, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.14379883, + "step": 3962, + "time_per_iteration": 2.8771471977233887 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01068735, + "balance_loss_mlp": 1.05375075, + "epoch": 0.7624086186994998, + "flos": 557965324800.0, + "grad_norm": 0.0731663524296794, + "language_loss": 0.83243585, + "learning_rate": 0.0001408739831592949, + "loss": 0.8431232, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.14953613, + "step": 3963, + "time_per_iteration": 2.719027280807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072758, + "balance_loss_mlp": 1.05827415, + "epoch": 0.7626010003847634, + "flos": 629132396544.0, + "grad_norm": 0.09597126350862907, + "language_loss": 0.77261025, + "learning_rate": 0.0001406572870354224, + "loss": 0.78333783, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.14477539, + "step": 3964, + "time_per_iteration": 2.8318536281585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_mlp": 1.05142951, + "epoch": 0.7627933820700269, + "flos": 437942702592.0, + "grad_norm": 0.06846487833206179, + "language_loss": 0.86648387, + "learning_rate": 0.00014044073042076337, + "loss": 0.87714344, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.14501953, + "step": 3965, + "time_per_iteration": 2.5620529651641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073227, + "balance_loss_mlp": 1.05873156, + "epoch": 0.7629857637552905, + "flos": 532723350528.0, + "grad_norm": 0.08133731345364971, + "language_loss": 0.89009362, + "learning_rate": 0.00014022431339939302, + "loss": 0.90082592, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.14489746, + "step": 3966, + "time_per_iteration": 2.6854476928710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071043, + "balance_loss_mlp": 1.05605876, + "epoch": 0.7631781454405541, + "flos": 680036290560.0, + "grad_norm": 0.08711941543983692, + "language_loss": 0.78104591, + "learning_rate": 0.00014000803605533163, + "loss": 0.79175639, + "num_input_tokens_seen": 
329034960, + "router_z_loss_mlp": 0.1496582, + "step": 3967, + "time_per_iteration": 2.83705735206604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077697, + "balance_loss_mlp": 1.06324911, + "epoch": 0.7633705271258177, + "flos": 507493859328.0, + "grad_norm": 0.09829351187117948, + "language_loss": 0.83671606, + "learning_rate": 0.00013979189847254553, + "loss": 0.84749299, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.14440918, + "step": 3968, + "time_per_iteration": 2.5781285762786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075641, + "balance_loss_mlp": 1.0607276, + "epoch": 0.7635629088110811, + "flos": 618866085888.0, + "grad_norm": 0.08084752979294811, + "language_loss": 0.80726254, + "learning_rate": 0.00013957590073494674, + "loss": 0.81801891, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.14904785, + "step": 3969, + "time_per_iteration": 2.8175971508026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070147, + "balance_loss_mlp": 1.05593681, + "epoch": 0.7637552904963447, + "flos": 638425193472.0, + "grad_norm": 0.08048508029980411, + "language_loss": 0.78762692, + "learning_rate": 0.0001393600429263931, + "loss": 0.7983284, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.14208984, + "step": 3970, + "time_per_iteration": 2.7563693523406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021274, + "balance_loss_mlp": 1.01469386, + "epoch": 0.7639476721816083, + "flos": 1563222302208.0, + "grad_norm": 0.013272156084273934, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75766158, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.06591797, + "step": 3971, + "time_per_iteration": 4.93863320350647 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067066, + "balance_loss_mlp": 1.05265367, + "epoch": 0.7641400538668719, + "flos": 495987162624.0, + "grad_norm": 
0.0632460471594908, + "language_loss": 0.81507617, + "learning_rate": 0.0001389287474315804, + "loss": 0.82574689, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.14404297, + "step": 3972, + "time_per_iteration": 2.6393582820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074461, + "balance_loss_mlp": 1.05995345, + "epoch": 0.7643324355521355, + "flos": 578441046528.0, + "grad_norm": 0.06341816515754745, + "language_loss": 0.80192941, + "learning_rate": 0.00013871330991276505, + "loss": 0.81267405, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.1451416, + "step": 3973, + "time_per_iteration": 2.714632987976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070348, + "balance_loss_mlp": 1.05616236, + "epoch": 0.764524817237399, + "flos": 784823717376.0, + "grad_norm": 0.085490997753428, + "language_loss": 0.80806011, + "learning_rate": 0.00013849801265788247, + "loss": 0.81876361, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.14196777, + "step": 3974, + "time_per_iteration": 4.533233880996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069636, + "balance_loss_mlp": 1.05451989, + "epoch": 0.7647171989226625, + "flos": 526279514112.0, + "grad_norm": 0.07052252307543246, + "language_loss": 0.8329643, + "learning_rate": 0.00013828285575051818, + "loss": 0.84366071, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.15100098, + "step": 3975, + "time_per_iteration": 2.6609082221984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068505, + "balance_loss_mlp": 1.05420017, + "epoch": 0.7649095806079261, + "flos": 554876656128.0, + "grad_norm": 0.07307751300876789, + "language_loss": 0.84132665, + "learning_rate": 0.0001380678392742035, + "loss": 0.85201168, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.14306641, + "step": 3976, + "time_per_iteration": 2.804594039916992 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01065652, + "balance_loss_mlp": 1.05060732, + "epoch": 0.7651019622931897, + "flos": 649145954304.0, + "grad_norm": 0.06679898937130221, + "language_loss": 0.84919113, + "learning_rate": 0.00013785296331241526, + "loss": 0.85984766, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.15039062, + "step": 3977, + "time_per_iteration": 2.8787760734558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066608, + "balance_loss_mlp": 1.0521363, + "epoch": 0.7652943439784533, + "flos": 1046449248768.0, + "grad_norm": 0.08384386833632657, + "language_loss": 0.87255394, + "learning_rate": 0.00013763822794857583, + "loss": 0.88322002, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.14477539, + "step": 3978, + "time_per_iteration": 3.372908115386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063846, + "balance_loss_mlp": 1.04954064, + "epoch": 0.7654867256637168, + "flos": 504350862336.0, + "grad_norm": 0.07264681342413916, + "language_loss": 0.90047956, + "learning_rate": 0.00013742363326605278, + "loss": 0.91111797, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.14306641, + "step": 3979, + "time_per_iteration": 2.7111921310424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068001, + "balance_loss_mlp": 1.05338621, + "epoch": 0.7656791073489804, + "flos": 574709976576.0, + "grad_norm": 0.06616632618822393, + "language_loss": 0.78682995, + "learning_rate": 0.00013720917934815935, + "loss": 0.79750991, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.14599609, + "step": 3980, + "time_per_iteration": 2.7665858268737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064513, + "balance_loss_mlp": 1.04985034, + "epoch": 0.765871489034244, + "flos": 492812232192.0, + "grad_norm": 0.0792407009711433, + "language_loss": 0.82811975, + "learning_rate": 0.00013699486627815344, + "loss": 0.83876491, + 
"num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.1463623, + "step": 3981, + "time_per_iteration": 2.5893211364746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067597, + "balance_loss_mlp": 1.05322027, + "epoch": 0.7660638707195075, + "flos": 486024800256.0, + "grad_norm": 0.06352647599324608, + "language_loss": 0.82432151, + "learning_rate": 0.00013678069413923928, + "loss": 0.83499742, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.14379883, + "step": 3982, + "time_per_iteration": 2.6204872131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063042, + "balance_loss_mlp": 1.04862928, + "epoch": 0.766256252404771, + "flos": 444295134720.0, + "grad_norm": 0.07852318401052459, + "language_loss": 0.82138562, + "learning_rate": 0.00013656666301456555, + "loss": 0.83201599, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.1439209, + "step": 3983, + "time_per_iteration": 2.5520832538604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065895, + "balance_loss_mlp": 1.05096984, + "epoch": 0.7664486340900346, + "flos": 485179766784.0, + "grad_norm": 0.06488313554531835, + "language_loss": 0.84368253, + "learning_rate": 0.0001363527729872267, + "loss": 0.85434151, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.14904785, + "step": 3984, + "time_per_iteration": 2.7092504501342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065315, + "balance_loss_mlp": 1.05033016, + "epoch": 0.7666410157752982, + "flos": 646200820224.0, + "grad_norm": 0.07270873670315516, + "language_loss": 0.76720321, + "learning_rate": 0.00013613902414026207, + "loss": 0.77785635, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.1496582, + "step": 3985, + "time_per_iteration": 2.8448526859283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065572, + "balance_loss_mlp": 1.05049181, + "epoch": 0.7668333974605618, + "flos": 
774303017472.0, + "grad_norm": 0.07569693962897468, + "language_loss": 0.82453251, + "learning_rate": 0.00013592541655665642, + "loss": 0.83518815, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.1505127, + "step": 3986, + "time_per_iteration": 3.0181548595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070428, + "balance_loss_mlp": 1.05580068, + "epoch": 0.7670257791458254, + "flos": 613462574592.0, + "grad_norm": 0.08265865172273029, + "language_loss": 0.85586035, + "learning_rate": 0.00013571195031933947, + "loss": 0.86656457, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.14611816, + "step": 3987, + "time_per_iteration": 2.7126588821411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006841, + "balance_loss_mlp": 1.00030851, + "epoch": 0.7672181608310888, + "flos": 1485357378048.0, + "grad_norm": 0.01029491447835557, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.8148818, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.06542969, + "step": 3988, + "time_per_iteration": 4.7078423500061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061224, + "balance_loss_mlp": 1.046525, + "epoch": 0.7674105425163524, + "flos": 610732182528.0, + "grad_norm": 0.06747537690646892, + "language_loss": 0.85686624, + "learning_rate": 0.00013528544221501655, + "loss": 0.86747837, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.14685059, + "step": 3989, + "time_per_iteration": 2.734370470046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076335, + "balance_loss_mlp": 1.06144583, + "epoch": 0.767602924201616, + "flos": 845205788160.0, + "grad_norm": 0.0637335052103759, + "language_loss": 0.81435496, + "learning_rate": 0.00013507240051359586, + "loss": 0.8251183, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.14868164, + "step": 3990, + "time_per_iteration": 3.06548810005188 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071514, + "balance_loss_mlp": 1.05666053, + "epoch": 0.7677953058868796, + "flos": 527114635776.0, + "grad_norm": 0.19838733588160684, + "language_loss": 0.85903186, + "learning_rate": 0.00013485950048963425, + "loss": 0.86974698, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.1484375, + "step": 3991, + "time_per_iteration": 2.6094348430633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106551, + "balance_loss_mlp": 1.05060887, + "epoch": 0.7679876875721431, + "flos": 923550501888.0, + "grad_norm": 0.07043981174766001, + "language_loss": 0.82674527, + "learning_rate": 0.00013464674222578643, + "loss": 0.83740032, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.14880371, + "step": 3992, + "time_per_iteration": 3.2195329666137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069425, + "balance_loss_mlp": 1.0544765, + "epoch": 0.7681800692574067, + "flos": 458087311872.0, + "grad_norm": 0.07032959255599644, + "language_loss": 0.83132064, + "learning_rate": 0.00013443412580465292, + "loss": 0.84201485, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.14929199, + "step": 3993, + "time_per_iteration": 2.5895824432373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065769, + "balance_loss_mlp": 1.05068934, + "epoch": 0.7683724509426703, + "flos": 658436179968.0, + "grad_norm": 0.06321728097990122, + "language_loss": 0.83854759, + "learning_rate": 0.00013422165130877857, + "loss": 0.84920526, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.15063477, + "step": 3994, + "time_per_iteration": 2.925792932510376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069232, + "balance_loss_mlp": 1.05415177, + "epoch": 0.7685648326279338, + "flos": 555284491776.0, + "grad_norm": 0.07271740437502876, + "language_loss": 0.80652654, + "learning_rate": 0.00013400931882065327, + "loss": 
0.8172189, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.15063477, + "step": 3995, + "time_per_iteration": 2.6709957122802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065338, + "balance_loss_mlp": 1.0499239, + "epoch": 0.7687572143131974, + "flos": 687404081664.0, + "grad_norm": 0.06876581607663422, + "language_loss": 0.81030929, + "learning_rate": 0.0001337971284227118, + "loss": 0.82096267, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.15393066, + "step": 3996, + "time_per_iteration": 3.056353807449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008904, + "balance_loss_mlp": 1.00222826, + "epoch": 0.7689495959984609, + "flos": 1489453691904.0, + "grad_norm": 0.013387325374254085, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77127326, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.06689453, + "step": 3997, + "time_per_iteration": 4.957718133926392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064729, + "balance_loss_mlp": 1.04914832, + "epoch": 0.7691419776837245, + "flos": 570405888000.0, + "grad_norm": 0.06293795645279736, + "language_loss": 0.80514187, + "learning_rate": 0.0001333731742268438, + "loss": 0.81578922, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.15576172, + "step": 3998, + "time_per_iteration": 2.712575674057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063537, + "balance_loss_mlp": 1.04812336, + "epoch": 0.7693343593689881, + "flos": 520087495680.0, + "grad_norm": 0.06867525176596115, + "language_loss": 0.85581779, + "learning_rate": 0.0001331614105935109, + "loss": 0.86645317, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.15393066, + "step": 3999, + "time_per_iteration": 2.7334744930267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061501, + "balance_loss_mlp": 1.04600382, + "epoch": 0.7695267410542517, + 
"flos": 660378438144.0, + "grad_norm": 0.06588382908784379, + "language_loss": 0.84056103, + "learning_rate": 0.00013294978937954883, + "loss": 0.85117608, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.15490723, + "step": 4000, + "time_per_iteration": 2.8713667392730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061328, + "balance_loss_mlp": 1.04591429, + "epoch": 0.7697191227395151, + "flos": 546809564160.0, + "grad_norm": 0.11170111036218774, + "language_loss": 0.85502183, + "learning_rate": 0.00013273831066711655, + "loss": 0.86563516, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.15393066, + "step": 4001, + "time_per_iteration": 2.674727201461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059012, + "balance_loss_mlp": 1.04409897, + "epoch": 0.7699115044247787, + "flos": 540610205184.0, + "grad_norm": 0.06526458774457519, + "language_loss": 0.80125463, + "learning_rate": 0.00013252697453831747, + "loss": 0.81184471, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.14880371, + "step": 4002, + "time_per_iteration": 2.7256710529327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063942, + "balance_loss_mlp": 1.04832566, + "epoch": 0.7701038861100423, + "flos": 562936407552.0, + "grad_norm": 0.06842053131152107, + "language_loss": 0.82420772, + "learning_rate": 0.00013231578107519916, + "loss": 0.83484715, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.15600586, + "step": 4003, + "time_per_iteration": 2.9035251140594482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106275, + "balance_loss_mlp": 1.04752731, + "epoch": 0.7702962677953059, + "flos": 481737964032.0, + "grad_norm": 0.07973091789387209, + "language_loss": 0.82878852, + "learning_rate": 0.00013210473035975422, + "loss": 0.83941609, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.15209961, + "step": 4004, + "time_per_iteration": 
2.628084182739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010592, + "balance_loss_mlp": 1.04378664, + "epoch": 0.7704886494805695, + "flos": 770389138944.0, + "grad_norm": 0.08630684221581464, + "language_loss": 0.85682714, + "learning_rate": 0.0001318938224739201, + "loss": 0.86741912, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.15393066, + "step": 4005, + "time_per_iteration": 3.1021761894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061315, + "balance_loss_mlp": 1.04588926, + "epoch": 0.770681031165833, + "flos": 601192336896.0, + "grad_norm": 0.06315324541354835, + "language_loss": 0.83698732, + "learning_rate": 0.00013168305749957843, + "loss": 0.84760046, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.1541748, + "step": 4006, + "time_per_iteration": 2.8380637168884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061238, + "balance_loss_mlp": 1.04613376, + "epoch": 0.7708734128510966, + "flos": 496108302336.0, + "grad_norm": 0.07282324785530167, + "language_loss": 0.82726502, + "learning_rate": 0.00013147243551855532, + "loss": 0.83787745, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.15075684, + "step": 4007, + "time_per_iteration": 2.6003365516662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064008, + "balance_loss_mlp": 1.04878509, + "epoch": 0.7710657945363601, + "flos": 567299966976.0, + "grad_norm": 0.07719085419162308, + "language_loss": 0.80652189, + "learning_rate": 0.00013126195661262148, + "loss": 0.81716192, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.15209961, + "step": 4008, + "time_per_iteration": 2.762053966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066486, + "balance_loss_mlp": 1.05129838, + "epoch": 0.7712581762216237, + "flos": 604550075904.0, + "grad_norm": 0.07418966803723698, + "language_loss": 0.86903155, + "learning_rate": 
0.00013105162086349216, + "loss": 0.87969637, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.15161133, + "step": 4009, + "time_per_iteration": 2.8321642875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060916, + "balance_loss_mlp": 1.04613364, + "epoch": 0.7714505579068872, + "flos": 530894891520.0, + "grad_norm": 0.07303373639120146, + "language_loss": 0.8590073, + "learning_rate": 0.00013084142835282687, + "loss": 0.86961645, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.14770508, + "step": 4010, + "time_per_iteration": 2.7242491245269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005899, + "balance_loss_mlp": 0.9993664, + "epoch": 0.7716429395921508, + "flos": 1422205267968.0, + "grad_norm": 0.003197258642765861, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80890262, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.06542969, + "step": 4011, + "time_per_iteration": 4.762616395950317 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106388, + "balance_loss_mlp": 1.04903793, + "epoch": 0.7718353212774144, + "flos": 578428563456.0, + "grad_norm": 0.08194546236645563, + "language_loss": 0.89672923, + "learning_rate": 0.0001304214733732485, + "loss": 0.90736794, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.1484375, + "step": 4012, + "time_per_iteration": 2.7599334716796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065761, + "balance_loss_mlp": 1.05091929, + "epoch": 0.772027702962678, + "flos": 510742941696.0, + "grad_norm": 0.07424002912728798, + "language_loss": 0.82715225, + "learning_rate": 0.00013021171106737672, + "loss": 0.83780992, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.14831543, + "step": 4013, + "time_per_iteration": 2.6886706352233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063572, + "balance_loss_mlp": 
1.04939795, + "epoch": 0.7722200846479416, + "flos": 525661705728.0, + "grad_norm": 0.05840576732821659, + "language_loss": 0.79845583, + "learning_rate": 0.00013000209232605071, + "loss": 0.80909157, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.1418457, + "step": 4014, + "time_per_iteration": 2.687988519668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069879, + "balance_loss_mlp": 1.05541873, + "epoch": 0.772412466333205, + "flos": 479598216192.0, + "grad_norm": 0.07708984464094068, + "language_loss": 0.79761243, + "learning_rate": 0.0001297926172306519, + "loss": 0.80831122, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.14440918, + "step": 4015, + "time_per_iteration": 2.691276788711548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071175, + "balance_loss_mlp": 1.05634522, + "epoch": 0.7726048480184686, + "flos": 905688801792.0, + "grad_norm": 0.0617812543483069, + "language_loss": 0.78855252, + "learning_rate": 0.0001295832858625055, + "loss": 0.79926431, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.14807129, + "step": 4016, + "time_per_iteration": 3.2806596755981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072517, + "balance_loss_mlp": 1.05814075, + "epoch": 0.7727972297037322, + "flos": 631380801024.0, + "grad_norm": 0.10339069481740779, + "language_loss": 0.69680643, + "learning_rate": 0.00012937409830288154, + "loss": 0.70753151, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.14367676, + "step": 4017, + "time_per_iteration": 2.863893508911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075198, + "balance_loss_mlp": 1.0609529, + "epoch": 0.7729896113889958, + "flos": 414786147840.0, + "grad_norm": 0.0799045104942487, + "language_loss": 0.85132849, + "learning_rate": 0.00012916505463299362, + "loss": 0.86208045, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.14233398, + "step": 4018, 
+ "time_per_iteration": 2.532130002975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073187, + "balance_loss_mlp": 1.05844092, + "epoch": 0.7731819930742593, + "flos": 668907694080.0, + "grad_norm": 0.09414519746136404, + "language_loss": 0.77866244, + "learning_rate": 0.00012895615493399972, + "loss": 0.78939426, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.14733887, + "step": 4019, + "time_per_iteration": 2.839327812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073965, + "balance_loss_mlp": 1.05939734, + "epoch": 0.7733743747595229, + "flos": 489854615040.0, + "grad_norm": 0.14078532910338418, + "language_loss": 0.82467055, + "learning_rate": 0.00012874739928700192, + "loss": 0.83541024, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.14562988, + "step": 4020, + "time_per_iteration": 2.596458911895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066633, + "balance_loss_mlp": 1.05195868, + "epoch": 0.7735667564447865, + "flos": 659612325888.0, + "grad_norm": 0.07681934826455636, + "language_loss": 0.79637134, + "learning_rate": 0.00012853878777304624, + "loss": 0.80703765, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.14660645, + "step": 4021, + "time_per_iteration": 2.881782054901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080593, + "balance_loss_mlp": 1.0660851, + "epoch": 0.77375913813005, + "flos": 533383004160.0, + "grad_norm": 0.05945562457109584, + "language_loss": 0.84455419, + "learning_rate": 0.000128330320473123, + "loss": 0.85536003, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.14489746, + "step": 4022, + "time_per_iteration": 2.7595038414001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102275, + "balance_loss_mlp": 1.01607394, + "epoch": 0.7739515198153136, + "flos": 1520081925120.0, + "grad_norm": 0.012779532981729017, + "language_loss": 0.783319, + 
"learning_rate": 0.00012812199746816628, + "loss": 0.7935465, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.06689453, + "step": 4023, + "time_per_iteration": 4.909268379211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079749, + "balance_loss_mlp": 1.06491959, + "epoch": 0.7741439015005771, + "flos": 640105348608.0, + "grad_norm": 0.0771739695841244, + "language_loss": 0.81660879, + "learning_rate": 0.0001279138188390543, + "loss": 0.82740629, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.14807129, + "step": 4024, + "time_per_iteration": 2.8041296005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072864, + "balance_loss_mlp": 1.05869019, + "epoch": 0.7743362831858407, + "flos": 665841420288.0, + "grad_norm": 0.05641860086988057, + "language_loss": 0.86285681, + "learning_rate": 0.00012770578466660915, + "loss": 0.87358546, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.1418457, + "step": 4025, + "time_per_iteration": 2.8959219455718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076822, + "balance_loss_mlp": 1.06208789, + "epoch": 0.7745286648711043, + "flos": 562760939520.0, + "grad_norm": 0.06540295848056549, + "language_loss": 0.8125031, + "learning_rate": 0.0001274978950315968, + "loss": 0.82327133, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.14709473, + "step": 4026, + "time_per_iteration": 2.8482625484466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078061, + "balance_loss_mlp": 1.06332707, + "epoch": 0.7747210465563679, + "flos": 516912565248.0, + "grad_norm": 0.20054905129576697, + "language_loss": 0.83055073, + "learning_rate": 0.00012729015001472716, + "loss": 0.84133136, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.14709473, + "step": 4027, + "time_per_iteration": 2.660585641860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076692, + 
"balance_loss_mlp": 1.06191039, + "epoch": 0.7749134282416313, + "flos": 634209937920.0, + "grad_norm": 0.06859872536525731, + "language_loss": 0.81346893, + "learning_rate": 0.00012708254969665418, + "loss": 0.82423586, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.14770508, + "step": 4028, + "time_per_iteration": 2.755984306335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087654, + "balance_loss_mlp": 1.07344401, + "epoch": 0.7751058099268949, + "flos": 495364584960.0, + "grad_norm": 0.13856653823900703, + "language_loss": 0.83200014, + "learning_rate": 0.00012687509415797526, + "loss": 0.84287679, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.14208984, + "step": 4029, + "time_per_iteration": 2.605494976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081118, + "balance_loss_mlp": 1.0666815, + "epoch": 0.7752981916121585, + "flos": 510310513152.0, + "grad_norm": 0.07842880902840609, + "language_loss": 0.81172287, + "learning_rate": 0.00012666778347923208, + "loss": 0.82253402, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.14428711, + "step": 4030, + "time_per_iteration": 2.7449951171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082077, + "balance_loss_mlp": 1.06805801, + "epoch": 0.7754905732974221, + "flos": 497548749312.0, + "grad_norm": 0.06532931928318482, + "language_loss": 0.83712244, + "learning_rate": 0.0001264606177409092, + "loss": 0.84794319, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.14025879, + "step": 4031, + "time_per_iteration": 2.654155731201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078609, + "balance_loss_mlp": 1.06400609, + "epoch": 0.7756829549826857, + "flos": 480744626688.0, + "grad_norm": 0.0713548804544701, + "language_loss": 0.85789335, + "learning_rate": 0.00012625359702343609, + "loss": 0.8686794, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 
0.14587402, + "step": 4032, + "time_per_iteration": 2.7373740673065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082548, + "balance_loss_mlp": 1.06805241, + "epoch": 0.7758753366679492, + "flos": 552630822912.0, + "grad_norm": 0.0791790150360774, + "language_loss": 0.85047174, + "learning_rate": 0.00012604672140718504, + "loss": 0.86129719, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.14477539, + "step": 4033, + "time_per_iteration": 2.668175458908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080956, + "balance_loss_mlp": 1.06637716, + "epoch": 0.7760677183532128, + "flos": 703835246592.0, + "grad_norm": 0.07723618035989119, + "language_loss": 0.77780712, + "learning_rate": 0.00012583999097247233, + "loss": 0.78861672, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.14562988, + "step": 4034, + "time_per_iteration": 2.882200241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082263, + "balance_loss_mlp": 1.06798124, + "epoch": 0.7762601000384763, + "flos": 523470200832.0, + "grad_norm": 0.07383461376071596, + "language_loss": 0.79689777, + "learning_rate": 0.0001256334057995578, + "loss": 0.80772036, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.1427002, + "step": 4035, + "time_per_iteration": 2.690647602081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080186, + "balance_loss_mlp": 1.06548703, + "epoch": 0.7764524817237399, + "flos": 557532896256.0, + "grad_norm": 0.06987700123133081, + "language_loss": 0.84825015, + "learning_rate": 0.000125426965968645, + "loss": 0.85905206, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.14672852, + "step": 4036, + "time_per_iteration": 2.7032387256622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077648, + "balance_loss_mlp": 1.06333125, + "epoch": 0.7766448634090035, + "flos": 579725849088.0, + "grad_norm": 0.07584247784389492, + 
"language_loss": 0.82193661, + "learning_rate": 0.00012522067155988092, + "loss": 0.83271313, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.14306641, + "step": 4037, + "time_per_iteration": 2.6950039863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107402, + "balance_loss_mlp": 1.05985785, + "epoch": 0.776837245094267, + "flos": 635603397120.0, + "grad_norm": 0.09463172891349511, + "language_loss": 0.75239801, + "learning_rate": 0.00012501452265335617, + "loss": 0.76313818, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.14160156, + "step": 4038, + "time_per_iteration": 2.8472111225128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107878, + "balance_loss_mlp": 1.06440353, + "epoch": 0.7770296267795306, + "flos": 614680565760.0, + "grad_norm": 0.06689469876162565, + "language_loss": 0.82871956, + "learning_rate": 0.0001248085193291047, + "loss": 0.83950734, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.14367676, + "step": 4039, + "time_per_iteration": 2.750570774078369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078863, + "balance_loss_mlp": 1.06394982, + "epoch": 0.7772220084647942, + "flos": 878808890880.0, + "grad_norm": 0.07053894567576345, + "language_loss": 0.82192504, + "learning_rate": 0.00012460266166710443, + "loss": 0.83271372, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.14904785, + "step": 4040, + "time_per_iteration": 3.2112436294555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072892, + "balance_loss_mlp": 1.05816936, + "epoch": 0.7774143901500578, + "flos": 839641489920.0, + "grad_norm": 0.07497892804432345, + "language_loss": 0.77567667, + "learning_rate": 0.00012439694974727633, + "loss": 0.78640562, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.14709473, + "step": 4041, + "time_per_iteration": 3.0245847702026367 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01073829, + "balance_loss_mlp": 1.05928516, + "epoch": 0.7776067718353212, + "flos": 568147571712.0, + "grad_norm": 0.060778076855285974, + "language_loss": 0.79776394, + "learning_rate": 0.00012419138364948458, + "loss": 0.8085022, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.14538574, + "step": 4042, + "time_per_iteration": 2.7336411476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066625, + "balance_loss_mlp": 1.05183077, + "epoch": 0.7777991535205848, + "flos": 745943012352.0, + "grad_norm": 0.06729651648033357, + "language_loss": 0.82573462, + "learning_rate": 0.00012398596345353702, + "loss": 0.83640087, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.14770508, + "step": 4043, + "time_per_iteration": 2.888540029525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072811, + "balance_loss_mlp": 1.05832708, + "epoch": 0.7779915352058484, + "flos": 538075104768.0, + "grad_norm": 0.06360284463986167, + "language_loss": 0.83389121, + "learning_rate": 0.0001237806892391851, + "loss": 0.84461933, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.14489746, + "step": 4044, + "time_per_iteration": 2.745943546295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072989, + "balance_loss_mlp": 1.05827904, + "epoch": 0.778183916891112, + "flos": 634788099072.0, + "grad_norm": 0.07557014389374586, + "language_loss": 0.80569065, + "learning_rate": 0.0001235755610861233, + "loss": 0.81642056, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.14685059, + "step": 4045, + "time_per_iteration": 2.8391878604888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073322, + "balance_loss_mlp": 1.05836082, + "epoch": 0.7783762985763756, + "flos": 588677621760.0, + "grad_norm": 0.08271633587976211, + "language_loss": 0.84886134, + "learning_rate": 0.0001233705790739893, + "loss": 0.85959458, + "num_input_tokens_seen": 
335640512, + "router_z_loss_mlp": 0.14941406, + "step": 4046, + "time_per_iteration": 2.7301955223083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070585, + "balance_loss_mlp": 1.05619669, + "epoch": 0.7785686802616391, + "flos": 930656563200.0, + "grad_norm": 0.07709409005439366, + "language_loss": 0.75105876, + "learning_rate": 0.0001231657432823643, + "loss": 0.76176465, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.14379883, + "step": 4047, + "time_per_iteration": 3.2447426319122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072293, + "balance_loss_mlp": 1.05745101, + "epoch": 0.7787610619469026, + "flos": 497934190080.0, + "grad_norm": 0.08319783109308485, + "language_loss": 0.78652561, + "learning_rate": 0.0001229610537907725, + "loss": 0.79724848, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.14819336, + "step": 4048, + "time_per_iteration": 2.6655571460723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071474, + "balance_loss_mlp": 1.05683541, + "epoch": 0.7789534436321662, + "flos": 515637674496.0, + "grad_norm": 0.1398744785443317, + "language_loss": 0.90141088, + "learning_rate": 0.00012275651067868143, + "loss": 0.91212559, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.14624023, + "step": 4049, + "time_per_iteration": 2.660238265991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072645, + "balance_loss_mlp": 1.05813682, + "epoch": 0.7791458253174298, + "flos": 988476369408.0, + "grad_norm": 0.057540222114583386, + "language_loss": 0.8025769, + "learning_rate": 0.00012255211402550182, + "loss": 0.81330329, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.14477539, + "step": 4050, + "time_per_iteration": 3.233478546142578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073902, + "balance_loss_mlp": 1.05907226, + "epoch": 0.7793382070026933, + "flos": 629040992256.0, + 
"grad_norm": 0.08623870329629198, + "language_loss": 0.76389378, + "learning_rate": 0.00012234786391058727, + "loss": 0.77463281, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.14819336, + "step": 4051, + "time_per_iteration": 2.771480083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078975, + "balance_loss_mlp": 1.06439614, + "epoch": 0.7795305886879569, + "flos": 531752408064.0, + "grad_norm": 0.08444624617327998, + "language_loss": 0.84906709, + "learning_rate": 0.0001221437604132352, + "loss": 0.85985684, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.14575195, + "step": 4052, + "time_per_iteration": 2.6185004711151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074724, + "balance_loss_mlp": 1.05997825, + "epoch": 0.7797229703732205, + "flos": 611979909120.0, + "grad_norm": 0.08471537445823431, + "language_loss": 0.8108837, + "learning_rate": 0.0001219398036126852, + "loss": 0.82163101, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.1472168, + "step": 4053, + "time_per_iteration": 4.269315004348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078474, + "balance_loss_mlp": 1.06376374, + "epoch": 0.7799153520584841, + "flos": 872164620288.0, + "grad_norm": 0.0665397662082905, + "language_loss": 0.78063762, + "learning_rate": 0.00012173599358812027, + "loss": 0.79142237, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.14685059, + "step": 4054, + "time_per_iteration": 3.3110597133636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082538, + "balance_loss_mlp": 1.06795871, + "epoch": 0.7801077337437476, + "flos": 583627244544.0, + "grad_norm": 0.07210936675879133, + "language_loss": 0.8279568, + "learning_rate": 0.0001215323304186668, + "loss": 0.83878219, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.14575195, + "step": 4055, + "time_per_iteration": 2.8330674171447754 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086694, + "balance_loss_mlp": 1.07248473, + "epoch": 0.7803001154290111, + "flos": 601165172736.0, + "grad_norm": 0.07177144261981991, + "language_loss": 0.87391448, + "learning_rate": 0.00012132881418339364, + "loss": 0.88478148, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.14196777, + "step": 4056, + "time_per_iteration": 2.776947259902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_mlp": 1.02952027, + "epoch": 0.7804924971142747, + "flos": 1479577591296.0, + "grad_norm": 0.02528916030641435, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78553808, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.06738281, + "step": 4057, + "time_per_iteration": 4.90228271484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083986, + "balance_loss_mlp": 1.06952608, + "epoch": 0.7806848787995383, + "flos": 630362870784.0, + "grad_norm": 0.06952403466648098, + "language_loss": 0.76993859, + "learning_rate": 0.00012092222283137944, + "loss": 0.78077847, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.14453125, + "step": 4058, + "time_per_iteration": 2.8027281761169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_mlp": 1.02595437, + "epoch": 0.7808772604848019, + "flos": 1417587319296.0, + "grad_norm": 0.023618595086734803, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79938942, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.06689453, + "step": 4059, + "time_per_iteration": 4.777599811553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086286, + "balance_loss_mlp": 1.07214832, + "epoch": 0.7810696421700654, + "flos": 731696011776.0, + "grad_norm": 0.06473679884808177, + "language_loss": 0.83483815, + "learning_rate": 0.00012051622016348856, + "loss": 
0.84570104, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.14135742, + "step": 4060, + "time_per_iteration": 3.0805013179779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082313, + "balance_loss_mlp": 1.0681268, + "epoch": 0.781262023855329, + "flos": 424941230592.0, + "grad_norm": 0.07665957086955441, + "language_loss": 0.84603846, + "learning_rate": 0.00012031343978315539, + "loss": 0.85686159, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.14208984, + "step": 4061, + "time_per_iteration": 2.509371280670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079981, + "balance_loss_mlp": 1.06562829, + "epoch": 0.7814544055405925, + "flos": 501027628032.0, + "grad_norm": 0.11716755196941751, + "language_loss": 0.82515299, + "learning_rate": 0.00012011080681021774, + "loss": 0.83595276, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.14355469, + "step": 4062, + "time_per_iteration": 2.653513193130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084404, + "balance_loss_mlp": 1.0701232, + "epoch": 0.7816467872258561, + "flos": 462448300032.0, + "grad_norm": 0.06950633997018366, + "language_loss": 0.86157346, + "learning_rate": 0.00011990832132334512, + "loss": 0.87241757, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.14282227, + "step": 4063, + "time_per_iteration": 2.5633385181427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107598, + "balance_loss_mlp": 1.06134164, + "epoch": 0.7818391689111197, + "flos": 740818483200.0, + "grad_norm": 0.08193675337903113, + "language_loss": 0.82464862, + "learning_rate": 0.00011970598340114897, + "loss": 0.83540839, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.14624023, + "step": 4064, + "time_per_iteration": 2.978691339492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075693, + "balance_loss_mlp": 1.06104183, + "epoch": 0.7820315505963832, + 
"flos": 547669278720.0, + "grad_norm": 0.07485411860694487, + "language_loss": 0.84175539, + "learning_rate": 0.00011950379312218396, + "loss": 0.85251236, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.1463623, + "step": 4065, + "time_per_iteration": 2.743990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077394, + "balance_loss_mlp": 1.06274307, + "epoch": 0.7822239322816468, + "flos": 728983245312.0, + "grad_norm": 0.06405873824193194, + "language_loss": 0.86273229, + "learning_rate": 0.00011930175056494719, + "loss": 0.87350619, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.1463623, + "step": 4066, + "time_per_iteration": 2.880624532699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077235, + "balance_loss_mlp": 1.06242979, + "epoch": 0.7824163139669104, + "flos": 452016433152.0, + "grad_norm": 0.05775885887204321, + "language_loss": 0.75816822, + "learning_rate": 0.00011909985580787885, + "loss": 0.76894057, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.14794922, + "step": 4067, + "time_per_iteration": 2.6789603233337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071164, + "balance_loss_mlp": 1.05672777, + "epoch": 0.782608695652174, + "flos": 540489065472.0, + "grad_norm": 0.06284042088337013, + "language_loss": 0.81289232, + "learning_rate": 0.00011889810892936137, + "loss": 0.82360399, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.14428711, + "step": 4068, + "time_per_iteration": 2.7376155853271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107855, + "balance_loss_mlp": 1.0636723, + "epoch": 0.7828010773374374, + "flos": 500308503552.0, + "grad_norm": 0.07211764568585548, + "language_loss": 0.77206087, + "learning_rate": 0.00011869651000771959, + "loss": 0.78284639, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.1484375, + "step": 4069, + "time_per_iteration": 2.85400390625 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076002, + "balance_loss_mlp": 1.06167328, + "epoch": 0.782993459022701, + "flos": 600816807936.0, + "grad_norm": 0.06878922071462945, + "language_loss": 0.82603711, + "learning_rate": 0.00011849505912122117, + "loss": 0.83679712, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.14318848, + "step": 4070, + "time_per_iteration": 2.7624692916870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073656, + "balance_loss_mlp": 1.05892146, + "epoch": 0.7831858407079646, + "flos": 810055779840.0, + "grad_norm": 0.07956596885242023, + "language_loss": 0.77556145, + "learning_rate": 0.00011829375634807654, + "loss": 0.78629792, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.1472168, + "step": 4071, + "time_per_iteration": 3.049309015274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107043, + "balance_loss_mlp": 1.05580282, + "epoch": 0.7833782223932282, + "flos": 806594153472.0, + "grad_norm": 0.06202372733379216, + "language_loss": 0.81076932, + "learning_rate": 0.00011809260176643821, + "loss": 0.8214736, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.14599609, + "step": 4072, + "time_per_iteration": 3.1130549907684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078344, + "balance_loss_mlp": 1.06361008, + "epoch": 0.7835706040784918, + "flos": 520870860288.0, + "grad_norm": 0.09346858465920099, + "language_loss": 0.8374989, + "learning_rate": 0.00011789159545440131, + "loss": 0.84828234, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.14709473, + "step": 4073, + "time_per_iteration": 2.602320909500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078791, + "balance_loss_mlp": 1.06415224, + "epoch": 0.7837629857637552, + "flos": 505605929472.0, + "grad_norm": 0.05972390006809772, + "language_loss": 0.82226318, + "learning_rate": 0.00011769073749000348, + "loss": 
0.83305109, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.14624023, + "step": 4074, + "time_per_iteration": 2.808209180831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073655, + "balance_loss_mlp": 1.05863476, + "epoch": 0.7839553674490188, + "flos": 516124431360.0, + "grad_norm": 0.07654822169545344, + "language_loss": 0.76011252, + "learning_rate": 0.0001174900279512246, + "loss": 0.77084911, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.14990234, + "step": 4075, + "time_per_iteration": 2.6128828525543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071701, + "balance_loss_mlp": 1.05739617, + "epoch": 0.7841477491342824, + "flos": 506648825856.0, + "grad_norm": 0.06466128589052426, + "language_loss": 0.81886286, + "learning_rate": 0.00011728946691598707, + "loss": 0.82957983, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.14318848, + "step": 4076, + "time_per_iteration": 2.660953998565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078769, + "balance_loss_mlp": 1.06449986, + "epoch": 0.784340130819546, + "flos": 719636120064.0, + "grad_norm": 0.09310549739723947, + "language_loss": 0.76184124, + "learning_rate": 0.00011708905446215561, + "loss": 0.77262896, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.1427002, + "step": 4077, + "time_per_iteration": 2.8871099948883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072579, + "balance_loss_mlp": 1.05803514, + "epoch": 0.7845325125048095, + "flos": 514441704960.0, + "grad_norm": 0.06079440348855826, + "language_loss": 0.80103385, + "learning_rate": 0.00011688879066753711, + "loss": 0.81175959, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.14526367, + "step": 4078, + "time_per_iteration": 2.7004237174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075184, + "balance_loss_mlp": 1.06087887, + "epoch": 0.7847248941900731, 
+ "flos": 466102646784.0, + "grad_norm": 0.08023192090613442, + "language_loss": 0.87211287, + "learning_rate": 0.00011668867560988122, + "loss": 0.88286471, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.14294434, + "step": 4079, + "time_per_iteration": 2.6138765811920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079594, + "balance_loss_mlp": 1.06459749, + "epoch": 0.7849172758753367, + "flos": 503028983808.0, + "grad_norm": 0.07587541015250795, + "language_loss": 0.84325242, + "learning_rate": 0.00011648870936687916, + "loss": 0.85404837, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.14978027, + "step": 4080, + "time_per_iteration": 2.829251766204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074534, + "balance_loss_mlp": 1.05989528, + "epoch": 0.7851096575606002, + "flos": 531999456768.0, + "grad_norm": 0.11404502533109409, + "language_loss": 0.78844041, + "learning_rate": 0.00011628889201616461, + "loss": 0.79918575, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.1463623, + "step": 4081, + "time_per_iteration": 2.6469521522521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073576, + "balance_loss_mlp": 1.05934227, + "epoch": 0.7853020392458638, + "flos": 569956207104.0, + "grad_norm": 0.07608494158988048, + "language_loss": 0.82050377, + "learning_rate": 0.00011608922363531393, + "loss": 0.83123952, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.14245605, + "step": 4082, + "time_per_iteration": 2.692795753479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075189, + "balance_loss_mlp": 1.06111002, + "epoch": 0.7854944209311273, + "flos": 832579845120.0, + "grad_norm": 0.08462347153699132, + "language_loss": 0.83413076, + "learning_rate": 0.00011588970430184504, + "loss": 0.84488267, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.14086914, + "step": 4083, + "time_per_iteration": 
3.1208105087280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072687, + "balance_loss_mlp": 1.05856121, + "epoch": 0.7856868026163909, + "flos": 559929604608.0, + "grad_norm": 0.07095149348984836, + "language_loss": 0.81742346, + "learning_rate": 0.00011569033409321822, + "loss": 0.82815039, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.14135742, + "step": 4084, + "time_per_iteration": 2.7347347736358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068027, + "balance_loss_mlp": 1.05343556, + "epoch": 0.7858791843016545, + "flos": 545230725120.0, + "grad_norm": 0.07957990529540243, + "language_loss": 0.73091239, + "learning_rate": 0.00011549111308683591, + "loss": 0.74159265, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.14587402, + "step": 4085, + "time_per_iteration": 2.7169110774993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071287, + "balance_loss_mlp": 1.05705309, + "epoch": 0.7860715659869181, + "flos": 380997665280.0, + "grad_norm": 0.09071290102640983, + "language_loss": 0.80941343, + "learning_rate": 0.00011529204136004251, + "loss": 0.8201263, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.14233398, + "step": 4086, + "time_per_iteration": 2.452552318572998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066539, + "balance_loss_mlp": 1.05191231, + "epoch": 0.7862639476721817, + "flos": 567440930304.0, + "grad_norm": 0.05875076882668594, + "language_loss": 0.84497392, + "learning_rate": 0.00011509311899012459, + "loss": 0.85563934, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.14624023, + "step": 4087, + "time_per_iteration": 2.717156410217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072502, + "balance_loss_mlp": 1.05781543, + "epoch": 0.7864563293574451, + "flos": 545238065664.0, + "grad_norm": 0.09582325425007773, + "language_loss": 0.78019136, + "learning_rate": 
0.00011489434605431053, + "loss": 0.79091644, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.14672852, + "step": 4088, + "time_per_iteration": 2.6889476776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065898, + "balance_loss_mlp": 1.05135465, + "epoch": 0.7866487110427087, + "flos": 563536963584.0, + "grad_norm": 0.07016527238188626, + "language_loss": 0.81085324, + "learning_rate": 0.0001146957226297708, + "loss": 0.82151222, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.14526367, + "step": 4089, + "time_per_iteration": 2.727022647857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070017, + "balance_loss_mlp": 1.05554497, + "epoch": 0.7868410927279723, + "flos": 728189968896.0, + "grad_norm": 0.08533113133407452, + "language_loss": 0.76128238, + "learning_rate": 0.00011449724879361827, + "loss": 0.77198255, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.14453125, + "step": 4090, + "time_per_iteration": 3.0626373291015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073737, + "balance_loss_mlp": 1.05938458, + "epoch": 0.7870334744132359, + "flos": 521355045888.0, + "grad_norm": 0.09697336218432462, + "language_loss": 0.7367081, + "learning_rate": 0.00011429892462290687, + "loss": 0.74744546, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.14343262, + "step": 4091, + "time_per_iteration": 2.688397169113159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066999, + "balance_loss_mlp": 1.05259871, + "epoch": 0.7872258560984994, + "flos": 451411107840.0, + "grad_norm": 0.06809709972371855, + "language_loss": 0.83140373, + "learning_rate": 0.00011410075019463295, + "loss": 0.84207374, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.1439209, + "step": 4092, + "time_per_iteration": 2.667365789413452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064434, + "balance_loss_mlp": 
1.04979479, + "epoch": 0.787418237783763, + "flos": 515195334144.0, + "grad_norm": 0.0662823120947489, + "language_loss": 0.79980755, + "learning_rate": 0.00011390272558573461, + "loss": 0.81045187, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.1463623, + "step": 4093, + "time_per_iteration": 2.7487874031066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063058, + "balance_loss_mlp": 1.04871678, + "epoch": 0.7876106194690266, + "flos": 485081021952.0, + "grad_norm": 0.07241506189294278, + "language_loss": 0.80018187, + "learning_rate": 0.00011370485087309202, + "loss": 0.81081247, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.14343262, + "step": 4094, + "time_per_iteration": 2.6645123958587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063246, + "balance_loss_mlp": 1.04858303, + "epoch": 0.7878030011542901, + "flos": 542841357312.0, + "grad_norm": 0.07706414391638888, + "language_loss": 0.79288125, + "learning_rate": 0.00011350712613352688, + "loss": 0.80351365, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.1463623, + "step": 4095, + "time_per_iteration": 2.700049877166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060614, + "balance_loss_mlp": 1.0458796, + "epoch": 0.7879953828395537, + "flos": 516739668480.0, + "grad_norm": 0.0878043495750585, + "language_loss": 0.79471409, + "learning_rate": 0.00011330955144380283, + "loss": 0.80532026, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.14733887, + "step": 4096, + "time_per_iteration": 2.652745008468628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106538, + "balance_loss_mlp": 1.0502882, + "epoch": 0.7881877645248172, + "flos": 582278201856.0, + "grad_norm": 0.08295045554320525, + "language_loss": 0.85968649, + "learning_rate": 0.00011311212688062483, + "loss": 0.87034023, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.15063477, + "step": 4097, 
+ "time_per_iteration": 2.860481023788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062768, + "balance_loss_mlp": 1.04778373, + "epoch": 0.7883801462100808, + "flos": 589171719168.0, + "grad_norm": 0.08289312695855233, + "language_loss": 0.77912939, + "learning_rate": 0.0001129148525206402, + "loss": 0.78975713, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.14953613, + "step": 4098, + "time_per_iteration": 2.8443920612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061859, + "balance_loss_mlp": 1.04729128, + "epoch": 0.7885725278953444, + "flos": 481728052224.0, + "grad_norm": 0.07565956052784888, + "language_loss": 0.86410785, + "learning_rate": 0.00011271772844043759, + "loss": 0.87472647, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.14562988, + "step": 4099, + "time_per_iteration": 2.67754864692688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061328, + "balance_loss_mlp": 1.04616427, + "epoch": 0.788764909580608, + "flos": 756794824704.0, + "grad_norm": 0.08256938816600788, + "language_loss": 0.76203871, + "learning_rate": 0.00011252075471654727, + "loss": 0.77265191, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.15136719, + "step": 4100, + "time_per_iteration": 2.9445242881774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062896, + "balance_loss_mlp": 1.04794669, + "epoch": 0.7889572912658714, + "flos": 702555213312.0, + "grad_norm": 0.06872757551446003, + "language_loss": 0.77701616, + "learning_rate": 0.00011232393142544133, + "loss": 0.7876451, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.14929199, + "step": 4101, + "time_per_iteration": 2.9418632984161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060972, + "balance_loss_mlp": 1.04578507, + "epoch": 0.789149672951135, + "flos": 736405364736.0, + "grad_norm": 0.0823367955958929, + "language_loss": 0.82776141, + 
"learning_rate": 0.00011212725864353323, + "loss": 0.83837116, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.15161133, + "step": 4102, + "time_per_iteration": 3.066218614578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009146, + "balance_loss_mlp": 1.002756, + "epoch": 0.7893420546363986, + "flos": 1481396511744.0, + "grad_norm": 0.00970990136946143, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77345079, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.06396484, + "step": 4103, + "time_per_iteration": 4.897639036178589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068091, + "balance_loss_mlp": 1.0529511, + "epoch": 0.7895344363216622, + "flos": 509072698368.0, + "grad_norm": 0.08329351881420698, + "language_loss": 0.75839722, + "learning_rate": 0.00011173436491267291, + "loss": 0.76907814, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.15148926, + "step": 4104, + "time_per_iteration": 2.572232484817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069348, + "balance_loss_mlp": 1.05428004, + "epoch": 0.7897268180069258, + "flos": 541988983296.0, + "grad_norm": 0.07889516146695053, + "language_loss": 0.81743544, + "learning_rate": 0.0001115381441162554, + "loss": 0.82812893, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.1505127, + "step": 4105, + "time_per_iteration": 2.6332814693450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010093, + "balance_loss_mlp": 1.00375092, + "epoch": 0.7899191996921893, + "flos": 1412687817216.0, + "grad_norm": 0.008847709876927975, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74593818, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.06347656, + "step": 4106, + "time_per_iteration": 4.895688533782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066556, + 
"balance_loss_mlp": 1.05182195, + "epoch": 0.7901115813774529, + "flos": 622841633280.0, + "grad_norm": 0.06549029266923186, + "language_loss": 0.85235715, + "learning_rate": 0.00011114615504234465, + "loss": 0.86302269, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.1472168, + "step": 4107, + "time_per_iteration": 2.799600839614868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064328, + "balance_loss_mlp": 1.04949808, + "epoch": 0.7903039630627164, + "flos": 645545935872.0, + "grad_norm": 0.08208418526226827, + "language_loss": 0.80641502, + "learning_rate": 0.00011095038691703468, + "loss": 0.81705832, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.14819336, + "step": 4108, + "time_per_iteration": 2.877985715866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062498, + "balance_loss_mlp": 1.0479306, + "epoch": 0.79049634474798, + "flos": 594365257728.0, + "grad_norm": 0.09971015959330254, + "language_loss": 0.82810932, + "learning_rate": 0.00011075476983417998, + "loss": 0.83873427, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.14550781, + "step": 4109, + "time_per_iteration": 2.881120204925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106703, + "balance_loss_mlp": 1.05190194, + "epoch": 0.7906887264332435, + "flos": 716093001216.0, + "grad_norm": 0.08829657837561553, + "language_loss": 0.77800107, + "learning_rate": 0.00011055930386972579, + "loss": 0.78867137, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.15112305, + "step": 4110, + "time_per_iteration": 2.8346822261810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068152, + "balance_loss_mlp": 1.05334675, + "epoch": 0.7908811081185071, + "flos": 789893918208.0, + "grad_norm": 0.07023814842259256, + "language_loss": 0.78629267, + "learning_rate": 0.00011036398909955863, + "loss": 0.79697418, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 
0.14794922, + "step": 4111, + "time_per_iteration": 2.9915273189544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072098, + "balance_loss_mlp": 1.05748332, + "epoch": 0.7910734898037707, + "flos": 641904072192.0, + "grad_norm": 0.06852892596590886, + "language_loss": 0.81336486, + "learning_rate": 0.00011016882559950648, + "loss": 0.82408583, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.14599609, + "step": 4112, + "time_per_iteration": 2.83972430229187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106327, + "balance_loss_mlp": 1.04859507, + "epoch": 0.7912658714890343, + "flos": 669357374976.0, + "grad_norm": 0.0738063160504073, + "language_loss": 0.80238831, + "learning_rate": 0.00010997381344533853, + "loss": 0.813021, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.14648438, + "step": 4113, + "time_per_iteration": 2.7973837852478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073593, + "balance_loss_mlp": 1.0586915, + "epoch": 0.7914582531742979, + "flos": 557779944960.0, + "grad_norm": 0.07609152386132986, + "language_loss": 0.80731696, + "learning_rate": 0.00010977895271276517, + "loss": 0.81805289, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.14892578, + "step": 4114, + "time_per_iteration": 2.7021210193634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106794, + "balance_loss_mlp": 1.05307484, + "epoch": 0.7916506348595613, + "flos": 570064863744.0, + "grad_norm": 0.06963604008344469, + "language_loss": 0.79982167, + "learning_rate": 0.00010958424347743807, + "loss": 0.8105011, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.14831543, + "step": 4115, + "time_per_iteration": 2.722219228744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077261, + "balance_loss_mlp": 1.06282425, + "epoch": 0.7918430165448249, + "flos": 718301758464.0, + "grad_norm": 0.06932829196563554, + 
"language_loss": 0.80035752, + "learning_rate": 0.00010938968581494991, + "loss": 0.81113005, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.14440918, + "step": 4116, + "time_per_iteration": 3.020597457885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107207, + "balance_loss_mlp": 1.05728841, + "epoch": 0.7920353982300885, + "flos": 553648753152.0, + "grad_norm": 0.08602194395595932, + "language_loss": 0.79036731, + "learning_rate": 0.000109195279800835, + "loss": 0.80108798, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.14758301, + "step": 4117, + "time_per_iteration": 2.752718210220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068929, + "balance_loss_mlp": 1.05411148, + "epoch": 0.7922277799153521, + "flos": 810120019968.0, + "grad_norm": 0.08368902662154773, + "language_loss": 0.76681507, + "learning_rate": 0.00010900102551056834, + "loss": 0.77750438, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.14794922, + "step": 4118, + "time_per_iteration": 3.036240816116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066229, + "balance_loss_mlp": 1.05164957, + "epoch": 0.7924201616006156, + "flos": 421351123968.0, + "grad_norm": 0.07604531563776018, + "language_loss": 0.84288156, + "learning_rate": 0.00010880692301956601, + "loss": 0.85354388, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.14550781, + "step": 4119, + "time_per_iteration": 2.493804693222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074792, + "balance_loss_mlp": 1.05971193, + "epoch": 0.7926125432858792, + "flos": 617852924928.0, + "grad_norm": 0.06651444124896129, + "language_loss": 0.86047828, + "learning_rate": 0.00010861297240318518, + "loss": 0.87122619, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.1505127, + "step": 4120, + "time_per_iteration": 2.934854030609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01067653, + "balance_loss_mlp": 1.05324042, + "epoch": 0.7928049249711427, + "flos": 602487051264.0, + "grad_norm": 0.07241093769806302, + "language_loss": 0.86881423, + "learning_rate": 0.00010841917373672444, + "loss": 0.87949073, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.1439209, + "step": 4121, + "time_per_iteration": 2.7358059883117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071172, + "balance_loss_mlp": 1.0565567, + "epoch": 0.7929973066564063, + "flos": 656024790528.0, + "grad_norm": 0.08053859190471425, + "language_loss": 0.78445637, + "learning_rate": 0.00010822552709542293, + "loss": 0.79516816, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.14599609, + "step": 4122, + "time_per_iteration": 2.8181402683258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071522, + "balance_loss_mlp": 1.05708575, + "epoch": 0.7931896883416699, + "flos": 536397520896.0, + "grad_norm": 0.07023161642485896, + "language_loss": 0.85994995, + "learning_rate": 0.0001080320325544612, + "loss": 0.87066519, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.14428711, + "step": 4123, + "time_per_iteration": 2.666490316390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070673, + "balance_loss_mlp": 1.05572367, + "epoch": 0.7933820700269334, + "flos": 498082493952.0, + "grad_norm": 0.1106860652376269, + "language_loss": 0.82816887, + "learning_rate": 0.00010783869018895997, + "loss": 0.83887559, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.14953613, + "step": 4124, + "time_per_iteration": 2.6471545696258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071579, + "balance_loss_mlp": 1.0571543, + "epoch": 0.793574451712197, + "flos": 537472350720.0, + "grad_norm": 0.07283258484620453, + "language_loss": 0.84189153, + "learning_rate": 0.00010764550007398189, + "loss": 0.85260737, + "num_input_tokens_seen": 342040496, + 
"router_z_loss_mlp": 0.14416504, + "step": 4125, + "time_per_iteration": 2.6587061882019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067686, + "balance_loss_mlp": 1.05301166, + "epoch": 0.7937668333974606, + "flos": 488285687808.0, + "grad_norm": 0.13078671480405682, + "language_loss": 0.81167138, + "learning_rate": 0.00010745246228452982, + "loss": 0.82234824, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.14660645, + "step": 4126, + "time_per_iteration": 2.645451784133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071819, + "balance_loss_mlp": 1.05741882, + "epoch": 0.7939592150827242, + "flos": 527425924608.0, + "grad_norm": 0.07416949151547285, + "language_loss": 0.81678915, + "learning_rate": 0.00010725957689554771, + "loss": 0.82750738, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.14379883, + "step": 4127, + "time_per_iteration": 2.765888214111328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068227, + "balance_loss_mlp": 1.05377841, + "epoch": 0.7941515967679876, + "flos": 541702287360.0, + "grad_norm": 0.059315040508318935, + "language_loss": 0.84973252, + "learning_rate": 0.00010706684398192013, + "loss": 0.86041474, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.14416504, + "step": 4128, + "time_per_iteration": 2.71177339553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073885, + "balance_loss_mlp": 1.0591861, + "epoch": 0.7943439784532512, + "flos": 518387516928.0, + "grad_norm": 0.07619386086866002, + "language_loss": 0.81954181, + "learning_rate": 0.00010687426361847313, + "loss": 0.83028066, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.14685059, + "step": 4129, + "time_per_iteration": 2.758657455444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106652, + "balance_loss_mlp": 1.05164242, + "epoch": 0.7945363601385148, + "flos": 509025710592.0, + "grad_norm": 
0.07169416903857827, + "language_loss": 0.85882586, + "learning_rate": 0.00010668183587997254, + "loss": 0.86949104, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.14868164, + "step": 4130, + "time_per_iteration": 2.596605062484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067605, + "balance_loss_mlp": 1.05271626, + "epoch": 0.7947287418237784, + "flos": 651214121472.0, + "grad_norm": 0.08455978064709659, + "language_loss": 0.77324224, + "learning_rate": 0.0001064895608411256, + "loss": 0.78391826, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.14868164, + "step": 4131, + "time_per_iteration": 2.796661853790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066687, + "balance_loss_mlp": 1.05177402, + "epoch": 0.794921123509042, + "flos": 696054477312.0, + "grad_norm": 0.0694318073220064, + "language_loss": 0.80456048, + "learning_rate": 0.00010629743857657998, + "loss": 0.81522739, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.14880371, + "step": 4132, + "time_per_iteration": 2.909764289855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012901, + "balance_loss_mlp": 1.00636816, + "epoch": 0.7951135051943055, + "flos": 1402942768128.0, + "grad_norm": 0.005332975437914604, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71611571, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.06542969, + "step": 4133, + "time_per_iteration": 4.588430166244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107322, + "balance_loss_mlp": 1.05867648, + "epoch": 0.795305886879569, + "flos": 810085515264.0, + "grad_norm": 0.06887248612226421, + "language_loss": 0.82134831, + "learning_rate": 0.00010591365266868802, + "loss": 0.83208048, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.14550781, + "step": 4134, + "time_per_iteration": 2.9644124507904053 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01012906, + "balance_loss_mlp": 1.00632596, + "epoch": 0.7954982685648326, + "flos": 1426005347328.0, + "grad_norm": 0.005331901517009852, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76524687, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.06591797, + "step": 4135, + "time_per_iteration": 4.896401405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068705, + "balance_loss_mlp": 1.05358934, + "epoch": 0.7956906502500962, + "flos": 389885197824.0, + "grad_norm": 0.06724470827619233, + "language_loss": 0.79032838, + "learning_rate": 0.00010553047875229166, + "loss": 0.80101544, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.15100098, + "step": 4136, + "time_per_iteration": 2.5450961589813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066364, + "balance_loss_mlp": 1.05189192, + "epoch": 0.7958830319353598, + "flos": 515573434368.0, + "grad_norm": 0.07076357232689101, + "language_loss": 0.83468044, + "learning_rate": 0.00010533912147689328, + "loss": 0.84534407, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.14465332, + "step": 4137, + "time_per_iteration": 2.693084239959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070158, + "balance_loss_mlp": 1.05553102, + "epoch": 0.7960754136206233, + "flos": 493941390336.0, + "grad_norm": 0.06121658887981785, + "language_loss": 0.8226397, + "learning_rate": 0.00010514791742243656, + "loss": 0.8333413, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.14599609, + "step": 4138, + "time_per_iteration": 2.6134862899780273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066373, + "balance_loss_mlp": 1.05163932, + "epoch": 0.7962677953058869, + "flos": 655728182784.0, + "grad_norm": 0.07416353979296561, + "language_loss": 0.82627141, + "learning_rate": 0.00010495686666315341, + "loss": 0.83693522, + 
"num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.14733887, + "step": 4139, + "time_per_iteration": 2.8959176540374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071604, + "balance_loss_mlp": 1.05745411, + "epoch": 0.7964601769911505, + "flos": 542384335872.0, + "grad_norm": 0.08076949744760686, + "language_loss": 0.77108532, + "learning_rate": 0.00010476596927321635, + "loss": 0.78180134, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.14147949, + "step": 4140, + "time_per_iteration": 2.6166224479675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067987, + "balance_loss_mlp": 1.05282402, + "epoch": 0.796652558676414, + "flos": 537650016768.0, + "grad_norm": 0.07641249861388391, + "language_loss": 0.8031469, + "learning_rate": 0.00010457522532673835, + "loss": 0.8138268, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.15136719, + "step": 4141, + "time_per_iteration": 2.8061392307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073775, + "balance_loss_mlp": 1.05888581, + "epoch": 0.7968449403616775, + "flos": 475091495424.0, + "grad_norm": 0.082895122944158, + "language_loss": 0.8321951, + "learning_rate": 0.00010438463489777272, + "loss": 0.84293288, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.14892578, + "step": 4142, + "time_per_iteration": 2.563521385192871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066066, + "balance_loss_mlp": 1.05122459, + "epoch": 0.7970373220469411, + "flos": 567613827072.0, + "grad_norm": 0.07215110676242628, + "language_loss": 0.77859384, + "learning_rate": 0.00010419419806031316, + "loss": 0.78925455, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.14807129, + "step": 4143, + "time_per_iteration": 2.692662000656128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075522, + "balance_loss_mlp": 1.06125236, + "epoch": 0.7972297037322047, + "flos": 
556208446464.0, + "grad_norm": 0.1076253909846465, + "language_loss": 0.83906108, + "learning_rate": 0.00010400391488829403, + "loss": 0.84981632, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.1427002, + "step": 4144, + "time_per_iteration": 2.853351593017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107537, + "balance_loss_mlp": 1.06045663, + "epoch": 0.7974220854174683, + "flos": 576180158976.0, + "grad_norm": 0.14917315056572417, + "language_loss": 0.86392915, + "learning_rate": 0.00010381378545558984, + "loss": 0.87468284, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.14892578, + "step": 4145, + "time_per_iteration": 2.7161378860473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070419, + "balance_loss_mlp": 1.05575621, + "epoch": 0.7976144671027319, + "flos": 483069754368.0, + "grad_norm": 0.08038510657602778, + "language_loss": 0.8457576, + "learning_rate": 0.00010362380983601505, + "loss": 0.85646176, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.14648438, + "step": 4146, + "time_per_iteration": 2.5544986724853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068569, + "balance_loss_mlp": 1.05391836, + "epoch": 0.7978068487879953, + "flos": 1077865615872.0, + "grad_norm": 0.05616342644239884, + "language_loss": 0.78731227, + "learning_rate": 0.00010343398810332477, + "loss": 0.79799801, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.1463623, + "step": 4147, + "time_per_iteration": 3.4725289344787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070017, + "balance_loss_mlp": 1.05533075, + "epoch": 0.7979992304732589, + "flos": 733739586048.0, + "grad_norm": 0.07604084389723553, + "language_loss": 0.84285581, + "learning_rate": 0.00010324432033121467, + "loss": 0.85355598, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.14672852, + "step": 4148, + "time_per_iteration": 2.925584554672241 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072951, + "balance_loss_mlp": 1.05818057, + "epoch": 0.7981916121585225, + "flos": 415774342656.0, + "grad_norm": 0.07506198760098327, + "language_loss": 0.83406597, + "learning_rate": 0.00010305480659332005, + "loss": 0.84479547, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.14746094, + "step": 4149, + "time_per_iteration": 2.6363680362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073859, + "balance_loss_mlp": 1.05917299, + "epoch": 0.7983839938437861, + "flos": 465257613312.0, + "grad_norm": 0.07209752388462913, + "language_loss": 0.83577174, + "learning_rate": 0.00010286544696321682, + "loss": 0.84651035, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.14685059, + "step": 4150, + "time_per_iteration": 2.5717275142669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071445, + "balance_loss_mlp": 1.05673504, + "epoch": 0.7985763755290496, + "flos": 510567473664.0, + "grad_norm": 0.08223257276108414, + "language_loss": 0.79523313, + "learning_rate": 0.00010267624151442073, + "loss": 0.80594754, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.14685059, + "step": 4151, + "time_per_iteration": 2.6743481159210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069723, + "balance_loss_mlp": 1.05467856, + "epoch": 0.7987687572143132, + "flos": 1010649498624.0, + "grad_norm": 0.0703143889745847, + "language_loss": 0.80934834, + "learning_rate": 0.000102487190320388, + "loss": 0.82004559, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.15014648, + "step": 4152, + "time_per_iteration": 3.3981220722198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067559, + "balance_loss_mlp": 1.05272949, + "epoch": 0.7989611388995768, + "flos": 1021078794240.0, + "grad_norm": 0.32574544217784795, + "language_loss": 0.79635817, + "learning_rate": 0.00010229829345451475, + "loss": 
0.80703378, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.14819336, + "step": 4153, + "time_per_iteration": 3.3228917121887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074745, + "balance_loss_mlp": 1.06002223, + "epoch": 0.7991535205848403, + "flos": 1101338601984.0, + "grad_norm": 0.06282548479751149, + "language_loss": 0.79764807, + "learning_rate": 0.00010210955099013724, + "loss": 0.8083955, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.14709473, + "step": 4154, + "time_per_iteration": 3.412867784500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070719, + "balance_loss_mlp": 1.05562687, + "epoch": 0.7993459022701039, + "flos": 834818337792.0, + "grad_norm": 0.0818211478301838, + "language_loss": 0.76729739, + "learning_rate": 0.00010192096300053167, + "loss": 0.77800465, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.15063477, + "step": 4155, + "time_per_iteration": 3.071514368057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071934, + "balance_loss_mlp": 1.05697358, + "epoch": 0.7995382839553674, + "flos": 522686836224.0, + "grad_norm": 0.06336335817254321, + "language_loss": 0.85153681, + "learning_rate": 0.00010173252955891477, + "loss": 0.86225611, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.14941406, + "step": 4156, + "time_per_iteration": 2.8336803913116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078348, + "balance_loss_mlp": 1.06368518, + "epoch": 0.799730665640631, + "flos": 537820715520.0, + "grad_norm": 0.07241348777253756, + "language_loss": 0.73074377, + "learning_rate": 0.00010154425073844253, + "loss": 0.74152726, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.1463623, + "step": 4157, + "time_per_iteration": 2.708444356918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071825, + "balance_loss_mlp": 1.05725741, + "epoch": 0.7999230473258946, + 
"flos": 505060075008.0, + "grad_norm": 0.05965313173183175, + "language_loss": 0.82319427, + "learning_rate": 0.00010135612661221138, + "loss": 0.83391249, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.14562988, + "step": 4158, + "time_per_iteration": 2.5790717601776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074651, + "balance_loss_mlp": 1.06004786, + "epoch": 0.8001154290111582, + "flos": 1027342393344.0, + "grad_norm": 0.07976593337081749, + "language_loss": 0.81996578, + "learning_rate": 0.00010116815725325751, + "loss": 0.83071226, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.14587402, + "step": 4159, + "time_per_iteration": 3.352048635482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074283, + "balance_loss_mlp": 1.0596205, + "epoch": 0.8003078106964217, + "flos": 750906754560.0, + "grad_norm": 0.07121421414311549, + "language_loss": 0.80415642, + "learning_rate": 0.00010098034273455725, + "loss": 0.81489933, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.1463623, + "step": 4160, + "time_per_iteration": 2.9569175243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071179, + "balance_loss_mlp": 1.05632544, + "epoch": 0.8005001923816852, + "flos": 488465925120.0, + "grad_norm": 0.06668806534008753, + "language_loss": 0.79674023, + "learning_rate": 0.00010079268312902662, + "loss": 0.80745208, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.14831543, + "step": 4161, + "time_per_iteration": 2.6834394931793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075834, + "balance_loss_mlp": 1.06082582, + "epoch": 0.8006925740669488, + "flos": 513248306688.0, + "grad_norm": 0.08968184454312078, + "language_loss": 0.81960094, + "learning_rate": 0.0001006051785095215, + "loss": 0.83035922, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.14978027, + "step": 4162, + "time_per_iteration": 
2.737863779067993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073874, + "balance_loss_mlp": 1.05921173, + "epoch": 0.8008849557522124, + "flos": 578529879552.0, + "grad_norm": 0.09340596389529475, + "language_loss": 0.79312497, + "learning_rate": 0.0001004178289488376, + "loss": 0.8038637, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.14672852, + "step": 4163, + "time_per_iteration": 2.7409329414367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074352, + "balance_loss_mlp": 1.05973649, + "epoch": 0.801077337437476, + "flos": 478708766208.0, + "grad_norm": 0.07216515601811406, + "language_loss": 0.84130692, + "learning_rate": 0.0001002306345197106, + "loss": 0.85205042, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.14599609, + "step": 4164, + "time_per_iteration": 2.537263870239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078084, + "balance_loss_mlp": 1.06349313, + "epoch": 0.8012697191227395, + "flos": 676700573184.0, + "grad_norm": 0.07365299620590934, + "language_loss": 0.80348939, + "learning_rate": 0.00010004359529481571, + "loss": 0.81427026, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.14575195, + "step": 4165, + "time_per_iteration": 2.8671815395355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078878, + "balance_loss_mlp": 1.0639292, + "epoch": 0.8014621008080031, + "flos": 1295132405760.0, + "grad_norm": 0.08098076628944305, + "language_loss": 0.82058138, + "learning_rate": 9.985671134676804e-05, + "loss": 0.83137012, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.14941406, + "step": 4166, + "time_per_iteration": 3.699275255203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076335, + "balance_loss_mlp": 1.06169629, + "epoch": 0.8016544824932667, + "flos": 511827683328.0, + "grad_norm": 0.0841721873236777, + "language_loss": 0.82996416, + "learning_rate": 
9.966998274812234e-05, + "loss": 0.84072757, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.14611816, + "step": 4167, + "time_per_iteration": 2.6094071865081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078702, + "balance_loss_mlp": 1.06408715, + "epoch": 0.8018468641785302, + "flos": 535690879488.0, + "grad_norm": 0.09563124315006066, + "language_loss": 0.80843663, + "learning_rate": 9.948340957137308e-05, + "loss": 0.8192237, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.14611816, + "step": 4168, + "time_per_iteration": 2.6237661838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079459, + "balance_loss_mlp": 1.0646534, + "epoch": 0.8020392458637937, + "flos": 1023431086080.0, + "grad_norm": 0.0771033219349132, + "language_loss": 0.79519576, + "learning_rate": 9.929699188895447e-05, + "loss": 0.8059904, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.14794922, + "step": 4169, + "time_per_iteration": 3.28833270072937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027031, + "balance_loss_mlp": 1.02049804, + "epoch": 0.8022316275490573, + "flos": 1561806821376.0, + "grad_norm": 0.022525572886173285, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.79081434, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.06542969, + "step": 4170, + "time_per_iteration": 4.9581146240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079183, + "balance_loss_mlp": 1.06417489, + "epoch": 0.8024240092343209, + "flos": 420698810880.0, + "grad_norm": 0.08083789363568177, + "language_loss": 0.83295381, + "learning_rate": 9.89246232965435e-05, + "loss": 0.84374571, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.14990234, + "step": 4171, + "time_per_iteration": 2.5198962688446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107979, + "balance_loss_mlp": 1.0656513, + 
"epoch": 0.8026163909195845, + "flos": 763836645888.0, + "grad_norm": 0.0840583068148426, + "language_loss": 0.7862519, + "learning_rate": 9.873867253111762e-05, + "loss": 0.79704976, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.14147949, + "step": 4172, + "time_per_iteration": 2.9434571266174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020525, + "balance_loss_mlp": 1.01408792, + "epoch": 0.8028087726048481, + "flos": 1518861362688.0, + "grad_norm": 0.0182097422778206, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81285089, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.06445312, + "step": 4173, + "time_per_iteration": 4.941962718963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076678, + "balance_loss_mlp": 1.06175327, + "epoch": 0.8030011542901115, + "flos": 517861486080.0, + "grad_norm": 0.16472070475326986, + "language_loss": 0.88381541, + "learning_rate": 9.836723842278733e-05, + "loss": 0.89458215, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.14892578, + "step": 4174, + "time_per_iteration": 2.6340060234069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079926, + "balance_loss_mlp": 1.0656451, + "epoch": 0.8031935359753751, + "flos": 545616165888.0, + "grad_norm": 0.07241242292177963, + "language_loss": 0.78390783, + "learning_rate": 9.818175522408646e-05, + "loss": 0.79470706, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.1427002, + "step": 4175, + "time_per_iteration": 2.7003095149993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075656, + "balance_loss_mlp": 1.06123185, + "epoch": 0.8033859176606387, + "flos": 603559309824.0, + "grad_norm": 0.08397414086825541, + "language_loss": 0.84535128, + "learning_rate": 9.79964280250632e-05, + "loss": 0.85610783, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.14416504, + "step": 4176, + 
"time_per_iteration": 2.842618227005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071865, + "balance_loss_mlp": 1.05723834, + "epoch": 0.8035782993459023, + "flos": 565859520000.0, + "grad_norm": 0.08177365403070841, + "language_loss": 0.81297785, + "learning_rate": 9.781125689766795e-05, + "loss": 0.82369649, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.14624023, + "step": 4177, + "time_per_iteration": 2.7389862537384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077895, + "balance_loss_mlp": 1.06313717, + "epoch": 0.8037706810311658, + "flos": 538435952640.0, + "grad_norm": 0.07713213601435066, + "language_loss": 0.84999192, + "learning_rate": 9.762624191379054e-05, + "loss": 0.86077082, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.14733887, + "step": 4178, + "time_per_iteration": 2.6558520793914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070602, + "balance_loss_mlp": 1.05615425, + "epoch": 0.8039630627164294, + "flos": 515187993600.0, + "grad_norm": 0.07194102205057808, + "language_loss": 0.79348469, + "learning_rate": 9.744138314526014e-05, + "loss": 0.80419075, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.14428711, + "step": 4179, + "time_per_iteration": 2.6974868774414062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013319, + "balance_loss_mlp": 1.00692964, + "epoch": 0.804155444401693, + "flos": 1478834247168.0, + "grad_norm": 0.009099192400520165, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.75746888, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.06396484, + "step": 4180, + "time_per_iteration": 4.910180330276489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078163, + "balance_loss_mlp": 1.06361914, + "epoch": 0.8043478260869565, + "flos": 521164896768.0, + "grad_norm": 0.06460867004727015, + "language_loss": 0.76895148, + 
"learning_rate": 9.707213454125396e-05, + "loss": 0.77973306, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.14538574, + "step": 4181, + "time_per_iteration": 2.673661470413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070028, + "balance_loss_mlp": 1.05518675, + "epoch": 0.8045402077722201, + "flos": 545448038400.0, + "grad_norm": 0.06289883522471808, + "language_loss": 0.80526221, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81596249, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.14819336, + "step": 4182, + "time_per_iteration": 2.8102376461029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107591, + "balance_loss_mlp": 1.06159282, + "epoch": 0.8047325894574836, + "flos": 678388068864.0, + "grad_norm": 0.06393173875827637, + "language_loss": 0.74231839, + "learning_rate": 9.670351165906921e-05, + "loss": 0.75307751, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.14306641, + "step": 4183, + "time_per_iteration": 2.9303932189941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107065, + "balance_loss_mlp": 1.05599904, + "epoch": 0.8049249711427472, + "flos": 587227262976.0, + "grad_norm": 0.09136907696197756, + "language_loss": 0.78323948, + "learning_rate": 9.65194350425882e-05, + "loss": 0.79394597, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.1463623, + "step": 4184, + "time_per_iteration": 2.787539005279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069011, + "balance_loss_mlp": 1.05470586, + "epoch": 0.8051173528280108, + "flos": 814194312192.0, + "grad_norm": 0.08523258631943265, + "language_loss": 0.77739137, + "learning_rate": 9.633551507115452e-05, + "loss": 0.78808153, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.14306641, + "step": 4185, + "time_per_iteration": 3.130908727645874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075899, + "balance_loss_mlp": 
1.06085443, + "epoch": 0.8053097345132744, + "flos": 725687175168.0, + "grad_norm": 0.12627970759044813, + "language_loss": 0.77332032, + "learning_rate": 9.615175181617259e-05, + "loss": 0.78407931, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.15026855, + "step": 4186, + "time_per_iteration": 2.9669125080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078873, + "balance_loss_mlp": 1.06421077, + "epoch": 0.805502116198538, + "flos": 748050453504.0, + "grad_norm": 0.0799806090831211, + "language_loss": 0.81470084, + "learning_rate": 9.596814534898552e-05, + "loss": 0.82548958, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.14648438, + "step": 4187, + "time_per_iteration": 3.013604164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072205, + "balance_loss_mlp": 1.05763793, + "epoch": 0.8056944978838014, + "flos": 640258421760.0, + "grad_norm": 0.06438173450385795, + "language_loss": 0.87444198, + "learning_rate": 9.578469574087561e-05, + "loss": 0.88516408, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.14562988, + "step": 4188, + "time_per_iteration": 2.8994572162628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072497, + "balance_loss_mlp": 1.05785775, + "epoch": 0.805886879569065, + "flos": 644631892992.0, + "grad_norm": 0.08726456548799634, + "language_loss": 0.78306341, + "learning_rate": 9.560140306306436e-05, + "loss": 0.79378831, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.14624023, + "step": 4189, + "time_per_iteration": 2.7558131217956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076011, + "balance_loss_mlp": 1.06158686, + "epoch": 0.8060792612543286, + "flos": 661230812160.0, + "grad_norm": 0.07215370646866548, + "language_loss": 0.81434023, + "learning_rate": 9.541826738671233e-05, + "loss": 0.8251003, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.14404297, + "step": 4190, 
+ "time_per_iteration": 2.8377161026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073247, + "balance_loss_mlp": 1.05854881, + "epoch": 0.8062716429395922, + "flos": 455075366400.0, + "grad_norm": 0.08365860957548234, + "language_loss": 0.8272016, + "learning_rate": 9.523528878291904e-05, + "loss": 0.83793408, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.14697266, + "step": 4191, + "time_per_iteration": 2.5463461875915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073967, + "balance_loss_mlp": 1.05886352, + "epoch": 0.8064640246248557, + "flos": 526407994368.0, + "grad_norm": 0.08656547672961308, + "language_loss": 0.85237193, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86311156, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.15087891, + "step": 4192, + "time_per_iteration": 2.628451108932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074197, + "balance_loss_mlp": 1.05997539, + "epoch": 0.8066564063101193, + "flos": 865115458560.0, + "grad_norm": 0.060557734767924705, + "language_loss": 0.81796318, + "learning_rate": 9.486980307710208e-05, + "loss": 0.82870519, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.14233398, + "step": 4193, + "time_per_iteration": 3.221529960632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073516, + "balance_loss_mlp": 1.05888867, + "epoch": 0.8068487879953828, + "flos": 530536614912.0, + "grad_norm": 0.06679701242235103, + "language_loss": 0.81742352, + "learning_rate": 9.468729611697246e-05, + "loss": 0.82815868, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.14599609, + "step": 4194, + "time_per_iteration": 2.7180535793304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107158, + "balance_loss_mlp": 1.05689359, + "epoch": 0.8070411696806464, + "flos": 566183291904.0, + "grad_norm": 0.06755378291949884, + "language_loss": 0.81656551, + 
"learning_rate": 9.450494651319003e-05, + "loss": 0.8272813, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.14672852, + "step": 4195, + "time_per_iteration": 2.661775827407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072945, + "balance_loss_mlp": 1.05812728, + "epoch": 0.80723355136591, + "flos": 986591010816.0, + "grad_norm": 0.05699418156609254, + "language_loss": 0.79166675, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80239624, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.14794922, + "step": 4196, + "time_per_iteration": 3.3003180027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107439, + "balance_loss_mlp": 1.05973983, + "epoch": 0.8074259330511735, + "flos": 566961513984.0, + "grad_norm": 0.0689415903823296, + "language_loss": 0.82874274, + "learning_rate": 9.414071965778221e-05, + "loss": 0.83948666, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.1463623, + "step": 4197, + "time_per_iteration": 2.79154896736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077001, + "balance_loss_mlp": 1.0621475, + "epoch": 0.8076183147364371, + "flos": 494662712832.0, + "grad_norm": 0.06697307053985302, + "language_loss": 0.79652965, + "learning_rate": 9.395884254756242e-05, + "loss": 0.80729973, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.14831543, + "step": 4198, + "time_per_iteration": 2.7206079959869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074545, + "balance_loss_mlp": 1.06003702, + "epoch": 0.8078106964217007, + "flos": 420011993088.0, + "grad_norm": 0.0821513988093656, + "language_loss": 0.79780805, + "learning_rate": 9.377712307650044e-05, + "loss": 0.80855346, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.14489746, + "step": 4199, + "time_per_iteration": 2.510125160217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074169, + "balance_loss_mlp": 
1.05935168, + "epoch": 0.8080030781069643, + "flos": 527537152512.0, + "grad_norm": 0.07168048357507804, + "language_loss": 0.83144093, + "learning_rate": 9.359556131514602e-05, + "loss": 0.84218264, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.14794922, + "step": 4200, + "time_per_iteration": 2.659519910812378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107465, + "balance_loss_mlp": 1.05992758, + "epoch": 0.8081954597922277, + "flos": 544148554752.0, + "grad_norm": 0.061739081334624905, + "language_loss": 0.81328112, + "learning_rate": 9.341415733398733e-05, + "loss": 0.82402754, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.14697266, + "step": 4201, + "time_per_iteration": 2.641256809234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073727, + "balance_loss_mlp": 1.0594337, + "epoch": 0.8083878414774913, + "flos": 640900823040.0, + "grad_norm": 0.07419172018903049, + "language_loss": 0.75228035, + "learning_rate": 9.323291120345207e-05, + "loss": 0.76301754, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.14294434, + "step": 4202, + "time_per_iteration": 2.8421621322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077714, + "balance_loss_mlp": 1.06293166, + "epoch": 0.8085802231627549, + "flos": 705614146560.0, + "grad_norm": 0.0834576005577422, + "language_loss": 0.72859406, + "learning_rate": 9.305182299390614e-05, + "loss": 0.73937118, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.14746094, + "step": 4203, + "time_per_iteration": 2.9009647369384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068048, + "balance_loss_mlp": 1.05312276, + "epoch": 0.8087726048480185, + "flos": 419762373120.0, + "grad_norm": 0.07762289218582992, + "language_loss": 0.88771188, + "learning_rate": 9.287089277565409e-05, + "loss": 0.89839238, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.14904785, + "step": 
4204, + "time_per_iteration": 2.6274211406707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075402, + "balance_loss_mlp": 1.06070352, + "epoch": 0.8089649865332821, + "flos": 508766178816.0, + "grad_norm": 0.07618621801756342, + "language_loss": 0.87048995, + "learning_rate": 9.269012061893922e-05, + "loss": 0.881244, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.14697266, + "step": 4205, + "time_per_iteration": 2.7980542182922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107201, + "balance_loss_mlp": 1.05760992, + "epoch": 0.8091573682185456, + "flos": 457219883520.0, + "grad_norm": 0.06817145148860111, + "language_loss": 0.85155141, + "learning_rate": 9.250950659394386e-05, + "loss": 0.86227149, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.14404297, + "step": 4206, + "time_per_iteration": 2.7548696994781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068706, + "balance_loss_mlp": 1.05355477, + "epoch": 0.8093497499038091, + "flos": 525256441344.0, + "grad_norm": 0.07651954688486194, + "language_loss": 0.7713989, + "learning_rate": 9.232905077078824e-05, + "loss": 0.78208601, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.15124512, + "step": 4207, + "time_per_iteration": 2.7961602210998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078891, + "balance_loss_mlp": 1.06387043, + "epoch": 0.8095421315890727, + "flos": 489617478144.0, + "grad_norm": 0.07872605928187458, + "language_loss": 0.76999003, + "learning_rate": 9.214875321953164e-05, + "loss": 0.78077894, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.15002441, + "step": 4208, + "time_per_iteration": 2.5866055488586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067244, + "balance_loss_mlp": 1.05254602, + "epoch": 0.8097345132743363, + "flos": 625109861376.0, + "grad_norm": 0.06523356997123914, + "language_loss": 0.8081665, + 
"learning_rate": 9.196861401017164e-05, + "loss": 0.81883889, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.14685059, + "step": 4209, + "time_per_iteration": 2.789491653442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075999, + "balance_loss_mlp": 1.06115699, + "epoch": 0.8099268949595998, + "flos": 615688584192.0, + "grad_norm": 0.06679247683416532, + "language_loss": 0.79083157, + "learning_rate": 9.178863321264475e-05, + "loss": 0.80159163, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.14819336, + "step": 4210, + "time_per_iteration": 2.80202579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072328, + "balance_loss_mlp": 1.05728388, + "epoch": 0.8101192766448634, + "flos": 479642632704.0, + "grad_norm": 0.08620954962074664, + "language_loss": 0.79814863, + "learning_rate": 9.160881089682566e-05, + "loss": 0.80887187, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.15026855, + "step": 4211, + "time_per_iteration": 2.657390594482422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072652, + "balance_loss_mlp": 1.05794144, + "epoch": 0.810311658330127, + "flos": 517327741440.0, + "grad_norm": 0.06333891813293195, + "language_loss": 0.86381185, + "learning_rate": 9.142914713252725e-05, + "loss": 0.87453836, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.14697266, + "step": 4212, + "time_per_iteration": 2.6212716102600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071222, + "balance_loss_mlp": 1.0565474, + "epoch": 0.8105040400153906, + "flos": 575782235136.0, + "grad_norm": 0.05862858167541506, + "language_loss": 0.84100783, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85172009, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.1463623, + "step": 4213, + "time_per_iteration": 2.822678804397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069022, + "balance_loss_mlp": 
1.05432391, + "epoch": 0.8106964217006541, + "flos": 638963707392.0, + "grad_norm": 0.07655774671852761, + "language_loss": 0.85175037, + "learning_rate": 9.107029553743862e-05, + "loss": 0.86244059, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.14685059, + "step": 4214, + "time_per_iteration": 2.8445212841033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071466, + "balance_loss_mlp": 1.05682731, + "epoch": 0.8108888033859176, + "flos": 579505964544.0, + "grad_norm": 0.1237093586633983, + "language_loss": 0.81737274, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82808745, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.14611816, + "step": 4215, + "time_per_iteration": 2.733858585357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068919, + "balance_loss_mlp": 1.05443513, + "epoch": 0.8110811850711812, + "flos": 559907209728.0, + "grad_norm": 0.07043550712901828, + "language_loss": 0.83526266, + "learning_rate": 9.071207898465284e-05, + "loss": 0.84595191, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.14477539, + "step": 4216, + "time_per_iteration": 2.795978546142578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010059, + "balance_loss_mlp": 1.00371671, + "epoch": 0.8112735667564448, + "flos": 1517939979264.0, + "grad_norm": 0.007733492761232115, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78270477, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.06347656, + "step": 4217, + "time_per_iteration": 4.671598672866821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073174, + "balance_loss_mlp": 1.0584631, + "epoch": 0.8114659484417084, + "flos": 616340897280.0, + "grad_norm": 0.09308711291624655, + "language_loss": 0.850631, + "learning_rate": 9.035449803045792e-05, + "loss": 0.86136276, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.14697266, + "step": 4218, 
+ "time_per_iteration": 2.8252713680267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070586, + "balance_loss_mlp": 1.05599451, + "epoch": 0.8116583301269719, + "flos": 649951340544.0, + "grad_norm": 0.06544053347412945, + "language_loss": 0.79116189, + "learning_rate": 9.017594607640211e-05, + "loss": 0.80186772, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.14562988, + "step": 4219, + "time_per_iteration": 3.0103390216827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072332, + "balance_loss_mlp": 1.05731189, + "epoch": 0.8118507118122354, + "flos": 553087844352.0, + "grad_norm": 0.06754060213637747, + "language_loss": 0.80264437, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81336772, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.14990234, + "step": 4220, + "time_per_iteration": 2.7641568183898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067935, + "balance_loss_mlp": 1.05295074, + "epoch": 0.812043093497499, + "flos": 544118819328.0, + "grad_norm": 0.08257930286833466, + "language_loss": 0.8756063, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88628566, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.14953613, + "step": 4221, + "time_per_iteration": 2.646381139755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107214, + "balance_loss_mlp": 1.05748951, + "epoch": 0.8122354751827626, + "flos": 583404788736.0, + "grad_norm": 0.06076540452447546, + "language_loss": 0.83228678, + "learning_rate": 8.964124513805628e-05, + "loss": 0.84300816, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.14624023, + "step": 4222, + "time_per_iteration": 2.7860500812530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011046, + "balance_loss_mlp": 1.00465596, + "epoch": 0.8124278568680262, + "flos": 1530568120320.0, + "grad_norm": 0.007608284192397786, + "language_loss": 0.78250074, + 
"learning_rate": 8.94633300305363e-05, + "loss": 0.79261118, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.06396484, + "step": 4223, + "time_per_iteration": 4.9178102016448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073494, + "balance_loss_mlp": 1.05906975, + "epoch": 0.8126202385532897, + "flos": 432865161216.0, + "grad_norm": 0.07270938351246994, + "language_loss": 0.79917443, + "learning_rate": 8.928557430748668e-05, + "loss": 0.80990934, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.14404297, + "step": 4224, + "time_per_iteration": 2.583998680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010746, + "balance_loss_mlp": 1.00440443, + "epoch": 0.8128126202385533, + "flos": 1547905987584.0, + "grad_norm": 0.00790710761891799, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77506375, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.06347656, + "step": 4225, + "time_per_iteration": 4.820707321166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070833, + "balance_loss_mlp": 1.05620575, + "epoch": 0.8130050019238169, + "flos": 528317945856.0, + "grad_norm": 0.06548779775423773, + "language_loss": 0.88866699, + "learning_rate": 8.893054129078077e-05, + "loss": 0.89937526, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.14624023, + "step": 4226, + "time_per_iteration": 2.636085271835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071555, + "balance_loss_mlp": 1.05685687, + "epoch": 0.8131973836090804, + "flos": 543125481984.0, + "grad_norm": 0.08255855084993005, + "language_loss": 0.80108345, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81179905, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.14685059, + "step": 4227, + "time_per_iteration": 2.8090357780456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107379, + 
"balance_loss_mlp": 1.05867422, + "epoch": 0.8133897652943439, + "flos": 576494019072.0, + "grad_norm": 0.08767577384778778, + "language_loss": 0.82186741, + "learning_rate": 8.857614663928249e-05, + "loss": 0.83260536, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.15087891, + "step": 4228, + "time_per_iteration": 2.7402915954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078602, + "balance_loss_mlp": 1.06378388, + "epoch": 0.8135821469796075, + "flos": 579219268608.0, + "grad_norm": 0.07685929665227552, + "language_loss": 0.78881317, + "learning_rate": 8.839918887251025e-05, + "loss": 0.79959923, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.14794922, + "step": 4229, + "time_per_iteration": 2.7446353435516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076808, + "balance_loss_mlp": 1.06232381, + "epoch": 0.8137745286648711, + "flos": 650346693120.0, + "grad_norm": 0.0759740537833267, + "language_loss": 0.83667004, + "learning_rate": 8.822239090334472e-05, + "loss": 0.8474381, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.14465332, + "step": 4230, + "time_per_iteration": 2.9547126293182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107277, + "balance_loss_mlp": 1.05783296, + "epoch": 0.8139669103501347, + "flos": 701888219136.0, + "grad_norm": 0.06626400468200025, + "language_loss": 0.7554509, + "learning_rate": 8.804575280042493e-05, + "loss": 0.76617861, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.14929199, + "step": 4231, + "time_per_iteration": 2.974144458770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080812, + "balance_loss_mlp": 1.06619692, + "epoch": 0.8141592920353983, + "flos": 650223355392.0, + "grad_norm": 0.08117031851913392, + "language_loss": 0.82810342, + "learning_rate": 8.786927463232774e-05, + "loss": 0.83891159, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 
0.14587402, + "step": 4232, + "time_per_iteration": 2.828878164291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078383, + "balance_loss_mlp": 1.06356478, + "epoch": 0.8143516737206618, + "flos": 536829949440.0, + "grad_norm": 0.07623472218938802, + "language_loss": 0.81033397, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82111776, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.14794922, + "step": 4233, + "time_per_iteration": 2.61362361907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076653, + "balance_loss_mlp": 1.06164443, + "epoch": 0.8145440554059253, + "flos": 508366056960.0, + "grad_norm": 0.08266771848864475, + "language_loss": 0.82275444, + "learning_rate": 8.751679837459963e-05, + "loss": 0.83352101, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.14978027, + "step": 4234, + "time_per_iteration": 2.5858421325683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071303, + "balance_loss_mlp": 1.05697441, + "epoch": 0.8147364370911889, + "flos": 635032576512.0, + "grad_norm": 0.05785121947375422, + "language_loss": 0.86312371, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87383676, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.14318848, + "step": 4235, + "time_per_iteration": 2.841019868850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072756, + "balance_loss_mlp": 1.05809283, + "epoch": 0.8149288187764525, + "flos": 422801482752.0, + "grad_norm": 0.07694022174465051, + "language_loss": 0.78536922, + "learning_rate": 8.716496267753343e-05, + "loss": 0.7960968, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.14660645, + "step": 4236, + "time_per_iteration": 2.4641432762145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107039, + "balance_loss_mlp": 1.055632, + "epoch": 0.8151212004617161, + "flos": 597444014592.0, + "grad_norm": 0.07150966295546053, + "language_loss": 
0.81421232, + "learning_rate": 8.698928521003097e-05, + "loss": 0.82491624, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.14733887, + "step": 4237, + "time_per_iteration": 2.7782487869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011063, + "balance_loss_mlp": 1.00476873, + "epoch": 0.8153135821469796, + "flos": 1479330915840.0, + "grad_norm": 0.006323287635293764, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78863907, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.06298828, + "step": 4238, + "time_per_iteration": 4.983094930648804 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067411, + "balance_loss_mlp": 1.0524863, + "epoch": 0.8155059638322432, + "flos": 437097669120.0, + "grad_norm": 0.10825435127351188, + "language_loss": 0.82812446, + "learning_rate": 8.663841137810741e-05, + "loss": 0.83879864, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.14916992, + "step": 4239, + "time_per_iteration": 2.5248992443084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107273, + "balance_loss_mlp": 1.05785286, + "epoch": 0.8156983455175068, + "flos": 794390727168.0, + "grad_norm": 0.07546845306981396, + "language_loss": 0.85244554, + "learning_rate": 8.646321514990763e-05, + "loss": 0.86317283, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.14855957, + "step": 4240, + "time_per_iteration": 3.0435335636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069391, + "balance_loss_mlp": 1.05460918, + "epoch": 0.8158907272027703, + "flos": 685986029568.0, + "grad_norm": 0.09379307453363464, + "language_loss": 0.81874454, + "learning_rate": 8.628817947092616e-05, + "loss": 0.82943839, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.14758301, + "step": 4241, + "time_per_iteration": 2.8032925128936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069769, + 
"balance_loss_mlp": 1.05539286, + "epoch": 0.8160831088880338, + "flos": 487055213568.0, + "grad_norm": 0.08597604805020649, + "language_loss": 0.84047925, + "learning_rate": 8.611330440911797e-05, + "loss": 0.85117698, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.14367676, + "step": 4242, + "time_per_iteration": 2.5600948333740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063929, + "balance_loss_mlp": 1.04928982, + "epoch": 0.8162754905732974, + "flos": 464872172544.0, + "grad_norm": 0.06850617145675146, + "language_loss": 0.80506492, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81570411, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.14611816, + "step": 4243, + "time_per_iteration": 2.554950475692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012818, + "balance_loss_mlp": 1.00652385, + "epoch": 0.816467872258561, + "flos": 1239530522112.0, + "grad_norm": 0.007477552534397375, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76297939, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.06298828, + "step": 4244, + "time_per_iteration": 4.717959880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063982, + "balance_loss_mlp": 1.04943848, + "epoch": 0.8166602539438246, + "flos": 687169516032.0, + "grad_norm": 0.058268983296576836, + "language_loss": 0.86534429, + "learning_rate": 8.558964360534615e-05, + "loss": 0.87598407, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.14526367, + "step": 4245, + "time_per_iteration": 2.9267804622650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011178, + "balance_loss_mlp": 1.00488424, + "epoch": 0.8168526356290882, + "flos": 1490520807936.0, + "grad_norm": 0.00711959110755669, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.73985922, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 
0.06298828, + "step": 4246, + "time_per_iteration": 4.947716951370239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066889, + "balance_loss_mlp": 1.05230975, + "epoch": 0.8170450173143516, + "flos": 578201338368.0, + "grad_norm": 0.07220804796872216, + "language_loss": 0.8435545, + "learning_rate": 8.524134073172984e-05, + "loss": 0.85422337, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.14575195, + "step": 4247, + "time_per_iteration": 2.71348237991333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070044, + "balance_loss_mlp": 1.0554409, + "epoch": 0.8172373989996152, + "flos": 571275514368.0, + "grad_norm": 0.06401767096743954, + "language_loss": 0.84267342, + "learning_rate": 8.506743079651974e-05, + "loss": 0.85337389, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.14599609, + "step": 4248, + "time_per_iteration": 2.7759175300598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064518, + "balance_loss_mlp": 1.04941392, + "epoch": 0.8174297806848788, + "flos": 528831866880.0, + "grad_norm": 0.08441006927366383, + "language_loss": 0.81059384, + "learning_rate": 8.489368195241948e-05, + "loss": 0.82123899, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.15075684, + "step": 4249, + "time_per_iteration": 2.687244176864624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106512, + "balance_loss_mlp": 1.05042124, + "epoch": 0.8176221623701424, + "flos": 569108602368.0, + "grad_norm": 0.06785328492638941, + "language_loss": 0.78955877, + "learning_rate": 8.47200942668846e-05, + "loss": 0.80021, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.14697266, + "step": 4250, + "time_per_iteration": 2.829185724258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066489, + "balance_loss_mlp": 1.05181456, + "epoch": 0.8178145440554059, + "flos": 656521459200.0, + "grad_norm": 0.07524856848543239, + "language_loss": 
0.80325913, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81392401, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.14660645, + "step": 4251, + "time_per_iteration": 2.8891162872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067615, + "balance_loss_mlp": 1.05297589, + "epoch": 0.8180069257406695, + "flos": 545924883456.0, + "grad_norm": 0.07928844694004242, + "language_loss": 0.87725914, + "learning_rate": 8.437340264101828e-05, + "loss": 0.88793522, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.14611816, + "step": 4252, + "time_per_iteration": 2.7597384452819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067221, + "balance_loss_mlp": 1.0522964, + "epoch": 0.818199307425933, + "flos": 619271350272.0, + "grad_norm": 0.08227515131076636, + "language_loss": 0.84713292, + "learning_rate": 8.420029883528474e-05, + "loss": 0.85780513, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.14904785, + "step": 4253, + "time_per_iteration": 2.727544069290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068369, + "balance_loss_mlp": 1.05302691, + "epoch": 0.8183916891111966, + "flos": 647618872320.0, + "grad_norm": 0.08297238851209021, + "language_loss": 0.76718354, + "learning_rate": 8.402735645731157e-05, + "loss": 0.77786726, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.15319824, + "step": 4254, + "time_per_iteration": 2.9058609008789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066395, + "balance_loss_mlp": 1.05169678, + "epoch": 0.8185840707964602, + "flos": 499120247808.0, + "grad_norm": 0.07214603685273151, + "language_loss": 0.77970219, + "learning_rate": 8.385457557424098e-05, + "loss": 0.79036617, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.14685059, + "step": 4255, + "time_per_iteration": 2.618168830871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106315, + 
"balance_loss_mlp": 1.04854703, + "epoch": 0.8187764524817237, + "flos": 786229659648.0, + "grad_norm": 0.06559935606493841, + "language_loss": 0.79293621, + "learning_rate": 8.368195625315251e-05, + "loss": 0.80356765, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.14599609, + "step": 4256, + "time_per_iteration": 3.2203421592712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062059, + "balance_loss_mlp": 1.04709792, + "epoch": 0.8189688341669873, + "flos": 550710959616.0, + "grad_norm": 0.05824841247064268, + "language_loss": 0.80574787, + "learning_rate": 8.350949856106283e-05, + "loss": 0.81636846, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.14929199, + "step": 4257, + "time_per_iteration": 2.925502300262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008141, + "balance_loss_mlp": 1.00179935, + "epoch": 0.8191612158522509, + "flos": 1351972435968.0, + "grad_norm": 0.005216238757074485, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72157484, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.06347656, + "step": 4258, + "time_per_iteration": 4.837713241577148 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060434, + "balance_loss_mlp": 1.04580665, + "epoch": 0.8193535975375145, + "flos": 544257211392.0, + "grad_norm": 0.08204220791726961, + "language_loss": 0.83521521, + "learning_rate": 8.316506833163318e-05, + "loss": 0.84581947, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.14599609, + "step": 4259, + "time_per_iteration": 2.687384605407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067951, + "balance_loss_mlp": 1.05326462, + "epoch": 0.8195459792227779, + "flos": 865733266944.0, + "grad_norm": 0.057213289118123956, + "language_loss": 0.85745478, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86813432, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 
0.14660645, + "step": 4260, + "time_per_iteration": 3.1039042472839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066565, + "balance_loss_mlp": 1.05187869, + "epoch": 0.8197383609080415, + "flos": 569293982208.0, + "grad_norm": 0.08308709136286152, + "language_loss": 0.81558263, + "learning_rate": 8.282128542083101e-05, + "loss": 0.82624829, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.14672852, + "step": 4261, + "time_per_iteration": 2.7021541595458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060256, + "balance_loss_mlp": 1.04535532, + "epoch": 0.8199307425933051, + "flos": 530813399040.0, + "grad_norm": 0.06915941438487261, + "language_loss": 0.85103023, + "learning_rate": 8.264963687678978e-05, + "loss": 0.86163288, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.14892578, + "step": 4262, + "time_per_iteration": 2.6805107593536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067164, + "balance_loss_mlp": 1.05225098, + "epoch": 0.8201231242785687, + "flos": 567070170624.0, + "grad_norm": 0.06623199585661957, + "language_loss": 0.84908283, + "learning_rate": 8.247815036252921e-05, + "loss": 0.85975444, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.14904785, + "step": 4263, + "time_per_iteration": 2.799445629119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064375, + "balance_loss_mlp": 1.05036759, + "epoch": 0.8203155059638323, + "flos": 1230505717248.0, + "grad_norm": 0.06807936964108087, + "language_loss": 0.82982183, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84046555, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.14038086, + "step": 4264, + "time_per_iteration": 3.5467734336853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066376, + "balance_loss_mlp": 1.05158246, + "epoch": 0.8205078876490958, + "flos": 574198626816.0, + "grad_norm": 0.06878665098349063, + 
"language_loss": 0.79854757, + "learning_rate": 8.213566368959558e-05, + "loss": 0.80921131, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.14770508, + "step": 4265, + "time_per_iteration": 2.6667027473449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068665, + "balance_loss_mlp": 1.05419254, + "epoch": 0.8207002693343594, + "flos": 931400280576.0, + "grad_norm": 0.07205474863641972, + "language_loss": 0.77937365, + "learning_rate": 8.196466366388744e-05, + "loss": 0.79006028, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.14465332, + "step": 4266, + "time_per_iteration": 3.2075653076171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068905, + "balance_loss_mlp": 1.05440879, + "epoch": 0.8208926510196229, + "flos": 549571889664.0, + "grad_norm": 0.06576276749924337, + "language_loss": 0.80342031, + "learning_rate": 8.179382593389029e-05, + "loss": 0.81410939, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.14501953, + "step": 4267, + "time_per_iteration": 2.6763927936553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070444, + "balance_loss_mlp": 1.0557574, + "epoch": 0.8210850327048865, + "flos": 648182352384.0, + "grad_norm": 0.058242671998823256, + "language_loss": 0.8210336, + "learning_rate": 8.162315056592918e-05, + "loss": 0.83173811, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.14672852, + "step": 4268, + "time_per_iteration": 2.861537456512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066951, + "balance_loss_mlp": 1.05231237, + "epoch": 0.82127741439015, + "flos": 601520878080.0, + "grad_norm": 0.09144521431172725, + "language_loss": 0.81410992, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82477945, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.14611816, + "step": 4269, + "time_per_iteration": 2.832193613052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01064735, + "balance_loss_mlp": 1.05038261, + "epoch": 0.8214697960754136, + "flos": 474831963648.0, + "grad_norm": 0.07129897215411395, + "language_loss": 0.83495176, + "learning_rate": 8.128228718110015e-05, + "loss": 0.84559911, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.14355469, + "step": 4270, + "time_per_iteration": 2.7007412910461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070918, + "balance_loss_mlp": 1.05664861, + "epoch": 0.8216621777606772, + "flos": 903648172032.0, + "grad_norm": 0.10714973214650605, + "language_loss": 0.84790981, + "learning_rate": 8.11120992965671e-05, + "loss": 0.85861897, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.1427002, + "step": 4271, + "time_per_iteration": 3.086967945098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070644, + "balance_loss_mlp": 1.05581439, + "epoch": 0.8218545594459408, + "flos": 514461528576.0, + "grad_norm": 0.09850812863840513, + "language_loss": 0.82123983, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83194625, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.14819336, + "step": 4272, + "time_per_iteration": 2.6546895503997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066366, + "balance_loss_mlp": 1.05183411, + "epoch": 0.8220469411312044, + "flos": 494536803840.0, + "grad_norm": 0.08038507923953937, + "language_loss": 0.86432809, + "learning_rate": 8.077221147362829e-05, + "loss": 0.87499177, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.14526367, + "step": 4273, + "time_per_iteration": 2.6141135692596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106927, + "balance_loss_mlp": 1.05478621, + "epoch": 0.8222393228164678, + "flos": 386433483264.0, + "grad_norm": 0.09203044891172038, + "language_loss": 0.8956039, + "learning_rate": 8.060251166717835e-05, + "loss": 0.90629661, + "num_input_tokens_seen": 354948288, + 
"router_z_loss_mlp": 0.14477539, + "step": 4274, + "time_per_iteration": 2.462885618209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070861, + "balance_loss_mlp": 1.05579329, + "epoch": 0.8224317045017314, + "flos": 536590241280.0, + "grad_norm": 0.062175194099720756, + "language_loss": 0.86843693, + "learning_rate": 8.043297468527383e-05, + "loss": 0.8791455, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.1505127, + "step": 4275, + "time_per_iteration": 2.687908172607422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066529, + "balance_loss_mlp": 1.05210471, + "epoch": 0.822624086186995, + "flos": 554899051008.0, + "grad_norm": 0.07402291421555263, + "language_loss": 0.82578254, + "learning_rate": 8.02636005937346e-05, + "loss": 0.83644789, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.14416504, + "step": 4276, + "time_per_iteration": 2.653111696243286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063808, + "balance_loss_mlp": 1.04871583, + "epoch": 0.8228164678722586, + "flos": 539579791872.0, + "grad_norm": 0.06446226199945072, + "language_loss": 0.79937363, + "learning_rate": 8.009438945831771e-05, + "loss": 0.81001174, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.15075684, + "step": 4277, + "time_per_iteration": 2.774325132369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067009, + "balance_loss_mlp": 1.05238247, + "epoch": 0.8230088495575221, + "flos": 473253124608.0, + "grad_norm": 0.06473508268718137, + "language_loss": 0.79103273, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80170286, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.14599609, + "step": 4278, + "time_per_iteration": 2.653775930404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066815, + "balance_loss_mlp": 1.0519377, + "epoch": 0.8232012312427857, + "flos": 591672314880.0, + "grad_norm": 
0.09210301400627263, + "language_loss": 0.82811761, + "learning_rate": 7.975645631856127e-05, + "loss": 0.83878583, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.14855957, + "step": 4279, + "time_per_iteration": 2.6823325157165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065231, + "balance_loss_mlp": 1.05017543, + "epoch": 0.8233936129280492, + "flos": 572644380672.0, + "grad_norm": 0.06658303905953458, + "language_loss": 0.74463868, + "learning_rate": 7.958773444541916e-05, + "loss": 0.75529099, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.15026855, + "step": 4280, + "time_per_iteration": 2.7801764011383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066907, + "balance_loss_mlp": 1.05257797, + "epoch": 0.8235859946133128, + "flos": 731337735168.0, + "grad_norm": 0.0690373463225978, + "language_loss": 0.78400791, + "learning_rate": 7.941917579079383e-05, + "loss": 0.79467702, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.14343262, + "step": 4281, + "time_per_iteration": 3.060039520263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066701, + "balance_loss_mlp": 1.05222869, + "epoch": 0.8237783762985764, + "flos": 570314483712.0, + "grad_norm": 0.07396215157678351, + "language_loss": 0.81395936, + "learning_rate": 7.92507804201253e-05, + "loss": 0.82462645, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.14453125, + "step": 4282, + "time_per_iteration": 2.808473587036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007879, + "balance_loss_mlp": 1.00144207, + "epoch": 0.8239707579838399, + "flos": 1466232897024.0, + "grad_norm": 0.005158334115964225, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76305556, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.06445312, + "step": 4283, + "time_per_iteration": 4.95106315612793 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01065977, + "balance_loss_mlp": 1.05111217, + "epoch": 0.8241631396691035, + "flos": 467313297408.0, + "grad_norm": 0.07378682301841104, + "language_loss": 0.80314898, + "learning_rate": 7.89144797921037e-05, + "loss": 0.81380886, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.14855957, + "step": 4284, + "time_per_iteration": 2.7099735736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007982, + "balance_loss_mlp": 1.00154495, + "epoch": 0.8243555213543671, + "flos": 1539426290688.0, + "grad_norm": 0.005169205601206867, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78942251, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.06445312, + "step": 4285, + "time_per_iteration": 4.925944089889526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065516, + "balance_loss_mlp": 1.05109131, + "epoch": 0.8245479030396307, + "flos": 797429836800.0, + "grad_norm": 0.07826077018857239, + "language_loss": 0.82661068, + "learning_rate": 7.85788330836078e-05, + "loss": 0.83726579, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.14428711, + "step": 4286, + "time_per_iteration": 3.1125218868255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068802, + "balance_loss_mlp": 1.05397248, + "epoch": 0.8247402847248941, + "flos": 646114185216.0, + "grad_norm": 0.06888456798344761, + "language_loss": 0.76415771, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77484572, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.14807129, + "step": 4287, + "time_per_iteration": 2.893860101699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068506, + "balance_loss_mlp": 1.05348611, + "epoch": 0.8249326664101577, + "flos": 604421595648.0, + "grad_norm": 0.06182947496579068, + "language_loss": 0.79757684, + "learning_rate": 7.824384081587637e-05, + "loss": 0.80826187, + "num_input_tokens_seen": 
356320320, + "router_z_loss_mlp": 0.15002441, + "step": 4288, + "time_per_iteration": 2.8073134422302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010702, + "balance_loss_mlp": 1.0554297, + "epoch": 0.8251250480954213, + "flos": 824369218560.0, + "grad_norm": 0.08909700338283992, + "language_loss": 0.86458504, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87528706, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.14746094, + "step": 4289, + "time_per_iteration": 3.1265206336975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_mlp": 1.04926586, + "epoch": 0.8253174297806849, + "flos": 757382897664.0, + "grad_norm": 0.07722312051706566, + "language_loss": 0.78082144, + "learning_rate": 7.790950350913112e-05, + "loss": 0.7914592, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.1451416, + "step": 4290, + "time_per_iteration": 2.919142246246338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070465, + "balance_loss_mlp": 1.05595672, + "epoch": 0.8255098114659485, + "flos": 794469648384.0, + "grad_norm": 0.06822496505203762, + "language_loss": 0.87448025, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88518488, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.14489746, + "step": 4291, + "time_per_iteration": 3.1968111991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063883, + "balance_loss_mlp": 1.04919672, + "epoch": 0.825702193151212, + "flos": 710417475072.0, + "grad_norm": 0.11140980724884261, + "language_loss": 0.77158391, + "learning_rate": 7.757582168257731e-05, + "loss": 0.78222275, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.14672852, + "step": 4292, + "time_per_iteration": 2.864590883255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064183, + "balance_loss_mlp": 1.04980659, + "epoch": 0.8258945748364755, + "flos": 683394029568.0, + "grad_norm": 
0.07528844179366555, + "language_loss": 0.80776614, + "learning_rate": 7.740922673634537e-05, + "loss": 0.81840801, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.14379883, + "step": 4293, + "time_per_iteration": 2.9787964820861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069315, + "balance_loss_mlp": 1.05462837, + "epoch": 0.8260869565217391, + "flos": 594563120640.0, + "grad_norm": 0.07232173047564831, + "language_loss": 0.78854036, + "learning_rate": 7.724279585440186e-05, + "loss": 0.79923344, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.14660645, + "step": 4294, + "time_per_iteration": 2.737032175064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063563, + "balance_loss_mlp": 1.04878104, + "epoch": 0.8262793382070027, + "flos": 651480993792.0, + "grad_norm": 0.11834626543573872, + "language_loss": 0.85043526, + "learning_rate": 7.707652910136098e-05, + "loss": 0.86107087, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.14758301, + "step": 4295, + "time_per_iteration": 2.7672622203826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067425, + "balance_loss_mlp": 1.05227339, + "epoch": 0.8264717198922663, + "flos": 538922709504.0, + "grad_norm": 0.07320373612786368, + "language_loss": 0.85068297, + "learning_rate": 7.691042654177315e-05, + "loss": 0.86135721, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.15136719, + "step": 4296, + "time_per_iteration": 2.727430820465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067722, + "balance_loss_mlp": 1.05339313, + "epoch": 0.8266641015775298, + "flos": 538949873664.0, + "grad_norm": 0.08277618732225704, + "language_loss": 0.75727075, + "learning_rate": 7.674448824012514e-05, + "loss": 0.76794797, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.14331055, + "step": 4297, + "time_per_iteration": 2.6567587852478027 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01068847, + "balance_loss_mlp": 1.0540297, + "epoch": 0.8268564832627934, + "flos": 585361728000.0, + "grad_norm": 0.0640063597091925, + "language_loss": 0.83917528, + "learning_rate": 7.657871426083979e-05, + "loss": 0.84986377, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.14794922, + "step": 4298, + "time_per_iteration": 2.7982728481292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_mlp": 1.0529747, + "epoch": 0.827048864948057, + "flos": 430661173248.0, + "grad_norm": 0.0794186132350224, + "language_loss": 0.84216493, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85284323, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.14831543, + "step": 4299, + "time_per_iteration": 2.479512929916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062096, + "balance_loss_mlp": 1.04768384, + "epoch": 0.8272412466333205, + "flos": 1388430761472.0, + "grad_norm": 0.07007017286970613, + "language_loss": 0.84912431, + "learning_rate": 7.624765952673069e-05, + "loss": 0.85974526, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.14379883, + "step": 4300, + "time_per_iteration": 3.7502307891845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065217, + "balance_loss_mlp": 1.05036318, + "epoch": 0.827433628318584, + "flos": 538230749184.0, + "grad_norm": 0.07093314756635549, + "language_loss": 0.82853031, + "learning_rate": 7.608237890043335e-05, + "loss": 0.8391825, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.1484375, + "step": 4301, + "time_per_iteration": 2.697632312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061301, + "balance_loss_mlp": 1.04642415, + "epoch": 0.8276260100038476, + "flos": 730734981120.0, + "grad_norm": 0.067461781222512, + "language_loss": 0.77062577, + "learning_rate": 7.59172628535526e-05, + "loss": 0.78123879, + "num_input_tokens_seen": 
357387712, + "router_z_loss_mlp": 0.14855957, + "step": 4302, + "time_per_iteration": 2.9730281829833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069537, + "balance_loss_mlp": 1.05501771, + "epoch": 0.8278183916891112, + "flos": 871102273536.0, + "grad_norm": 0.0590298560947334, + "language_loss": 0.82669955, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83739495, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.14501953, + "step": 4303, + "time_per_iteration": 3.175729274749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066264, + "balance_loss_mlp": 1.05120802, + "epoch": 0.8280107733743748, + "flos": 594543297024.0, + "grad_norm": 0.05865024398378704, + "language_loss": 0.77674329, + "learning_rate": 7.558752475439134e-05, + "loss": 0.78740591, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.15039062, + "step": 4304, + "time_per_iteration": 2.7806692123413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071171, + "balance_loss_mlp": 1.05629373, + "epoch": 0.8282031550596384, + "flos": 768607667712.0, + "grad_norm": 0.06803152802988026, + "language_loss": 0.84490967, + "learning_rate": 7.542290283012653e-05, + "loss": 0.8556214, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.14868164, + "step": 4305, + "time_per_iteration": 3.042027711868286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065414, + "balance_loss_mlp": 1.05056047, + "epoch": 0.8283955367449019, + "flos": 696108805632.0, + "grad_norm": 0.07027931411926491, + "language_loss": 0.77876532, + "learning_rate": 7.525844574130947e-05, + "loss": 0.78941941, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.14831543, + "step": 4306, + "time_per_iteration": 3.0258917808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067598, + "balance_loss_mlp": 1.0529592, + "epoch": 0.8285879184301654, + "flos": 660630256128.0, + "grad_norm": 
0.06295078199718337, + "language_loss": 0.82822084, + "learning_rate": 7.509415355178806e-05, + "loss": 0.83889681, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.14611816, + "step": 4307, + "time_per_iteration": 2.9383127689361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067189, + "balance_loss_mlp": 1.05246627, + "epoch": 0.828780300115429, + "flos": 558709042176.0, + "grad_norm": 0.08802993540008418, + "language_loss": 0.77530718, + "learning_rate": 7.493002632534618e-05, + "loss": 0.78597909, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.14709473, + "step": 4308, + "time_per_iteration": 2.690993547439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066796, + "balance_loss_mlp": 1.05171561, + "epoch": 0.8289726818006926, + "flos": 830963930112.0, + "grad_norm": 0.07318550442475504, + "language_loss": 0.82053602, + "learning_rate": 7.476606412570352e-05, + "loss": 0.83120394, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.15063477, + "step": 4309, + "time_per_iteration": 3.061457872390747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068785, + "balance_loss_mlp": 1.05449212, + "epoch": 0.8291650634859561, + "flos": 732289227264.0, + "grad_norm": 0.10021622774197819, + "language_loss": 0.80771077, + "learning_rate": 7.460226701651624e-05, + "loss": 0.81839859, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.1427002, + "step": 4310, + "time_per_iteration": 2.9217689037323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069318, + "balance_loss_mlp": 1.0544883, + "epoch": 0.8293574451712197, + "flos": 860910114816.0, + "grad_norm": 0.07749506282182811, + "language_loss": 0.8143084, + "learning_rate": 7.443863506137566e-05, + "loss": 0.82500154, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.14807129, + "step": 4311, + "time_per_iteration": 3.2195286750793457 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0106071, + "balance_loss_mlp": 1.04607165, + "epoch": 0.8295498268564833, + "flos": 495156810240.0, + "grad_norm": 0.055714920992617885, + "language_loss": 0.81537104, + "learning_rate": 7.427516832380948e-05, + "loss": 0.8259781, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.14611816, + "step": 4312, + "time_per_iteration": 2.894439220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070264, + "balance_loss_mlp": 1.05601811, + "epoch": 0.8297422085417469, + "flos": 554471391744.0, + "grad_norm": 0.06262478668438266, + "language_loss": 0.77979529, + "learning_rate": 7.4111866867281e-05, + "loss": 0.7904979, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.14233398, + "step": 4313, + "time_per_iteration": 2.79099440574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106636, + "balance_loss_mlp": 1.0517329, + "epoch": 0.8299345902270104, + "flos": 1247497417728.0, + "grad_norm": 0.07618762117958246, + "language_loss": 0.77313519, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78379875, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.14624023, + "step": 4314, + "time_per_iteration": 3.6615333557128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072423, + "balance_loss_mlp": 1.05779576, + "epoch": 0.8301269719122739, + "flos": 585260411904.0, + "grad_norm": 0.18397477745179813, + "language_loss": 0.82993805, + "learning_rate": 7.378576005087034e-05, + "loss": 0.8406623, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.14611816, + "step": 4315, + "time_per_iteration": 2.8126580715179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072538, + "balance_loss_mlp": 1.05780363, + "epoch": 0.8303193535975375, + "flos": 509732352000.0, + "grad_norm": 0.08855740032604588, + "language_loss": 0.84620678, + "learning_rate": 7.362295481759412e-05, + "loss": 0.8569321, + "num_input_tokens_seen": 358501344, 
+ "router_z_loss_mlp": 0.1472168, + "step": 4316, + "time_per_iteration": 2.6704373359680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071184, + "balance_loss_mlp": 1.05666399, + "epoch": 0.8305117352828011, + "flos": 580652375040.0, + "grad_norm": 0.0829439330873515, + "language_loss": 0.8352679, + "learning_rate": 7.346031511856722e-05, + "loss": 0.84597969, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.14526367, + "step": 4317, + "time_per_iteration": 2.697376012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068221, + "balance_loss_mlp": 1.05336761, + "epoch": 0.8307041169680647, + "flos": 481626736128.0, + "grad_norm": 0.07562403040012457, + "language_loss": 0.78876424, + "learning_rate": 7.329784101693232e-05, + "loss": 0.7994464, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.14831543, + "step": 4318, + "time_per_iteration": 2.674924373626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072061, + "balance_loss_mlp": 1.05757725, + "epoch": 0.8308964986533282, + "flos": 624605852160.0, + "grad_norm": 0.17247227675142032, + "language_loss": 0.82843518, + "learning_rate": 7.313553257576727e-05, + "loss": 0.83915579, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.14465332, + "step": 4319, + "time_per_iteration": 2.780308723449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107238, + "balance_loss_mlp": 1.0574789, + "epoch": 0.8310888803385917, + "flos": 827319495168.0, + "grad_norm": 0.07679767527195203, + "language_loss": 0.78869575, + "learning_rate": 7.297338985808589e-05, + "loss": 0.79941958, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.14880371, + "step": 4320, + "time_per_iteration": 3.001223087310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.06415725, + "epoch": 0.8312812620238553, + "flos": 583743241728.0, + "grad_norm": 
0.07050095475557064, + "language_loss": 0.8173933, + "learning_rate": 7.281141292683746e-05, + "loss": 0.8281799, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.14501953, + "step": 4321, + "time_per_iteration": 2.8004937171936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077036, + "balance_loss_mlp": 1.06259966, + "epoch": 0.8314736437091189, + "flos": 1115605052928.0, + "grad_norm": 0.07881560078697845, + "language_loss": 0.74389625, + "learning_rate": 7.26496018449071e-05, + "loss": 0.75466657, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.14428711, + "step": 4322, + "time_per_iteration": 3.427699565887451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075197, + "balance_loss_mlp": 1.06103539, + "epoch": 0.8316660253943825, + "flos": 517547625984.0, + "grad_norm": 0.07598389174722883, + "language_loss": 0.81921697, + "learning_rate": 7.248795667511543e-05, + "loss": 0.82996899, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.14172363, + "step": 4323, + "time_per_iteration": 2.7954294681549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076668, + "balance_loss_mlp": 1.06266105, + "epoch": 0.831858407079646, + "flos": 795329736192.0, + "grad_norm": 0.09151709224743419, + "language_loss": 0.7770648, + "learning_rate": 7.232647748021864e-05, + "loss": 0.78783149, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.14025879, + "step": 4324, + "time_per_iteration": 3.0391266345977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107714, + "balance_loss_mlp": 1.06284642, + "epoch": 0.8320507887649096, + "flos": 549967242240.0, + "grad_norm": 0.0787637779106886, + "language_loss": 0.83117342, + "learning_rate": 7.216516432290843e-05, + "loss": 0.84194481, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.14282227, + "step": 4325, + "time_per_iteration": 2.715921640396118 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01077349, + "balance_loss_mlp": 1.06300807, + "epoch": 0.8322431704501732, + "flos": 479398155264.0, + "grad_norm": 0.07342050905632894, + "language_loss": 0.82109582, + "learning_rate": 7.20040172658123e-05, + "loss": 0.8318693, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.14331055, + "step": 4326, + "time_per_iteration": 2.5846447944641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079059, + "balance_loss_mlp": 1.0648613, + "epoch": 0.8324355521354367, + "flos": 572434407936.0, + "grad_norm": 0.08685227783658636, + "language_loss": 0.85463101, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86542159, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.14208984, + "step": 4327, + "time_per_iteration": 2.7091739177703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080456, + "balance_loss_mlp": 1.066401, + "epoch": 0.8326279338207002, + "flos": 503454071808.0, + "grad_norm": 0.06549925067141421, + "language_loss": 0.82185209, + "learning_rate": 7.168222170244888e-05, + "loss": 0.83265662, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.14050293, + "step": 4328, + "time_per_iteration": 2.5954463481903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079519, + "balance_loss_mlp": 1.06559563, + "epoch": 0.8328203155059638, + "flos": 605743474176.0, + "grad_norm": 0.06612642563497466, + "language_loss": 0.80887103, + "learning_rate": 7.152157332111364e-05, + "loss": 0.81966615, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.13928223, + "step": 4329, + "time_per_iteration": 2.91013240814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078449, + "balance_loss_mlp": 1.06431079, + "epoch": 0.8330126971912274, + "flos": 697798872576.0, + "grad_norm": 0.07964779047842838, + "language_loss": 0.85973161, + "learning_rate": 7.136109128985663e-05, + "loss": 0.87051612, + "num_input_tokens_seen": 
359554048, + "router_z_loss_mlp": 0.14147949, + "step": 4330, + "time_per_iteration": 2.9252800941467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_mlp": 1.06949401, + "epoch": 0.833205078876491, + "flos": 494042706432.0, + "grad_norm": 0.07334420274354847, + "language_loss": 0.86698532, + "learning_rate": 7.120077567098249e-05, + "loss": 0.87782007, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.13977051, + "step": 4331, + "time_per_iteration": 2.65694522857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107354, + "balance_loss_mlp": 1.0595088, + "epoch": 0.8333974605617546, + "flos": 482812793856.0, + "grad_norm": 0.0626600317816662, + "language_loss": 0.82693064, + "learning_rate": 7.104062652673115e-05, + "loss": 0.83766603, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.14038086, + "step": 4332, + "time_per_iteration": 2.7553460597991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082446, + "balance_loss_mlp": 1.06805778, + "epoch": 0.833589842247018, + "flos": 686821151232.0, + "grad_norm": 0.08703524611699036, + "language_loss": 0.82555664, + "learning_rate": 7.088064391927818e-05, + "loss": 0.83638108, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.1439209, + "step": 4333, + "time_per_iteration": 2.828909397125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107505, + "balance_loss_mlp": 1.06068492, + "epoch": 0.8337822239322816, + "flos": 881739343872.0, + "grad_norm": 0.0819256687419709, + "language_loss": 0.8264882, + "learning_rate": 7.072082791073419e-05, + "loss": 0.83723867, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.14367676, + "step": 4334, + "time_per_iteration": 3.081200361251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077915, + "balance_loss_mlp": 1.06354976, + "epoch": 0.8339746056175452, + "flos": 497183132160.0, + "grad_norm": 
0.06916085041313896, + "language_loss": 0.82657623, + "learning_rate": 7.056117856314531e-05, + "loss": 0.83735543, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.14355469, + "step": 4335, + "time_per_iteration": 2.6069602966308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079649, + "balance_loss_mlp": 1.06497395, + "epoch": 0.8341669873028088, + "flos": 510495892992.0, + "grad_norm": 0.13056988952609092, + "language_loss": 0.86229324, + "learning_rate": 7.040169593849289e-05, + "loss": 0.87308979, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.14660645, + "step": 4336, + "time_per_iteration": 2.591599225997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074785, + "balance_loss_mlp": 1.06100416, + "epoch": 0.8343593689880723, + "flos": 692321209344.0, + "grad_norm": 0.13647106986897586, + "language_loss": 0.84314466, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85389245, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.13818359, + "step": 4337, + "time_per_iteration": 2.8121745586395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074468, + "balance_loss_mlp": 1.06028175, + "epoch": 0.8345517506733359, + "flos": 552408367104.0, + "grad_norm": 0.07949021649042014, + "language_loss": 0.78494132, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79568601, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.14172363, + "step": 4338, + "time_per_iteration": 2.7921485900878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074509, + "balance_loss_mlp": 1.05995309, + "epoch": 0.8347441323585995, + "flos": 592052613120.0, + "grad_norm": 0.08099810824139689, + "language_loss": 0.76340652, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77415156, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.14550781, + "step": 4339, + "time_per_iteration": 2.8709957599639893 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01077425, + "balance_loss_mlp": 1.06347775, + "epoch": 0.834936514043863, + "flos": 614917702656.0, + "grad_norm": 0.06378238002097801, + "language_loss": 0.84410638, + "learning_rate": 6.976543390660983e-05, + "loss": 0.85488063, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.13964844, + "step": 4340, + "time_per_iteration": 2.763047456741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074413, + "balance_loss_mlp": 1.06045377, + "epoch": 0.8351288957291266, + "flos": 467844470784.0, + "grad_norm": 0.08868106733466218, + "language_loss": 0.79730743, + "learning_rate": 6.960678582409424e-05, + "loss": 0.80805159, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.13964844, + "step": 4341, + "time_per_iteration": 2.5980849266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076452, + "balance_loss_mlp": 1.06226587, + "epoch": 0.8353212774143901, + "flos": 509319747072.0, + "grad_norm": 0.0623240268119806, + "language_loss": 0.78920925, + "learning_rate": 6.944830483504328e-05, + "loss": 0.79997373, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.14196777, + "step": 4342, + "time_per_iteration": 2.643486261367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070423, + "balance_loss_mlp": 1.05589139, + "epoch": 0.8355136590996537, + "flos": 687784753152.0, + "grad_norm": 0.06845892903357542, + "language_loss": 0.80452394, + "learning_rate": 6.928999100098483e-05, + "loss": 0.81522822, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.1451416, + "step": 4343, + "time_per_iteration": 2.865501880645752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073025, + "balance_loss_mlp": 1.0586009, + "epoch": 0.8357060407849173, + "flos": 984409417728.0, + "grad_norm": 0.06662147764559252, + "language_loss": 0.83445907, + "learning_rate": 6.913184438338138e-05, + "loss": 0.84518933, + "num_input_tokens_seen": 
360568624, + "router_z_loss_mlp": 0.14416504, + "step": 4344, + "time_per_iteration": 3.2890896797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076846, + "balance_loss_mlp": 1.06261289, + "epoch": 0.8358984224701809, + "flos": 843026393088.0, + "grad_norm": 0.0775623311546164, + "language_loss": 0.85284698, + "learning_rate": 6.89738650436313e-05, + "loss": 0.86361539, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.14245605, + "step": 4345, + "time_per_iteration": 3.215787410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074225, + "balance_loss_mlp": 1.05949068, + "epoch": 0.8360908041554445, + "flos": 626239019520.0, + "grad_norm": 0.07651611032194032, + "language_loss": 0.82082218, + "learning_rate": 6.881605304306748e-05, + "loss": 0.83156443, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.14709473, + "step": 4346, + "time_per_iteration": 2.781648635864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070263, + "balance_loss_mlp": 1.05558789, + "epoch": 0.8362831858407079, + "flos": 576068931072.0, + "grad_norm": 0.06989910685686435, + "language_loss": 0.84813631, + "learning_rate": 6.865840844295796e-05, + "loss": 0.85883898, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.14660645, + "step": 4347, + "time_per_iteration": 2.784560203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078519, + "balance_loss_mlp": 1.06411839, + "epoch": 0.8364755675259715, + "flos": 833783155200.0, + "grad_norm": 0.12496364806049359, + "language_loss": 0.80586934, + "learning_rate": 6.850093130450569e-05, + "loss": 0.8166545, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.1439209, + "step": 4348, + "time_per_iteration": 3.0966875553131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074403, + "balance_loss_mlp": 1.05977595, + "epoch": 0.8366679492112351, + "flos": 582480834048.0, + "grad_norm": 
0.07680322058687222, + "language_loss": 0.86201406, + "learning_rate": 6.834362168884912e-05, + "loss": 0.87275803, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.14624023, + "step": 4349, + "time_per_iteration": 2.688755989074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076536, + "balance_loss_mlp": 1.06240952, + "epoch": 0.8368603308964987, + "flos": 611722948608.0, + "grad_norm": 0.08191236295522616, + "language_loss": 0.87558603, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88635135, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.14123535, + "step": 4350, + "time_per_iteration": 2.7902283668518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073123, + "balance_loss_mlp": 1.05873418, + "epoch": 0.8370527125817622, + "flos": 507264062976.0, + "grad_norm": 0.06833066188081044, + "language_loss": 0.85545194, + "learning_rate": 6.802950527014884e-05, + "loss": 0.86618322, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.14355469, + "step": 4351, + "time_per_iteration": 2.754146099090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106708, + "balance_loss_mlp": 1.05251265, + "epoch": 0.8372450942670258, + "flos": 770952619008.0, + "grad_norm": 0.07146969997883827, + "language_loss": 0.82481229, + "learning_rate": 6.787269858905603e-05, + "loss": 0.83548313, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.14550781, + "step": 4352, + "time_per_iteration": 2.94331693649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073282, + "balance_loss_mlp": 1.05865479, + "epoch": 0.8374374759522893, + "flos": 579276168192.0, + "grad_norm": 0.07308977517607267, + "language_loss": 0.85184574, + "learning_rate": 6.771605967466033e-05, + "loss": 0.86257857, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.14611816, + "step": 4353, + "time_per_iteration": 2.693153142929077 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0107036, + "balance_loss_mlp": 1.0557214, + "epoch": 0.8376298576375529, + "flos": 788129699328.0, + "grad_norm": 0.10269547820167589, + "language_loss": 0.82213604, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83283961, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.14624023, + "step": 4354, + "time_per_iteration": 3.0104711055755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071035, + "balance_loss_mlp": 1.05653942, + "epoch": 0.8378222393228165, + "flos": 577613265408.0, + "grad_norm": 0.06911393067496661, + "language_loss": 0.80482757, + "learning_rate": 6.74032853891452e-05, + "loss": 0.81553793, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.14477539, + "step": 4355, + "time_per_iteration": 2.755267858505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069203, + "balance_loss_mlp": 1.05463576, + "epoch": 0.83801462100808, + "flos": 480865766400.0, + "grad_norm": 0.07252144879258707, + "language_loss": 0.8209852, + "learning_rate": 6.724715013945548e-05, + "loss": 0.8316772, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.14550781, + "step": 4356, + "time_per_iteration": 2.6092493534088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068662, + "balance_loss_mlp": 1.05458319, + "epoch": 0.8382070026933436, + "flos": 550817044992.0, + "grad_norm": 0.07005511647028967, + "language_loss": 0.89297009, + "learning_rate": 6.709118289932226e-05, + "loss": 0.90365666, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.14074707, + "step": 4357, + "time_per_iteration": 2.8237545490264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069868, + "balance_loss_mlp": 1.0553124, + "epoch": 0.8383993843786072, + "flos": 624968898048.0, + "grad_norm": 0.07260980188745762, + "language_loss": 0.82167578, + "learning_rate": 6.693538372929725e-05, + "loss": 0.83237451, + "num_input_tokens_seen": 
361614256, + "router_z_loss_mlp": 0.14538574, + "step": 4358, + "time_per_iteration": 2.9932587146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070372, + "balance_loss_mlp": 1.0557332, + "epoch": 0.8385917660638708, + "flos": 491169153024.0, + "grad_norm": 0.13657826580523555, + "language_loss": 0.86348242, + "learning_rate": 6.677975268986719e-05, + "loss": 0.8741861, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.14611816, + "step": 4359, + "time_per_iteration": 2.6329987049102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071816, + "balance_loss_mlp": 1.05714154, + "epoch": 0.8387841477491342, + "flos": 466900692480.0, + "grad_norm": 0.07525835690119967, + "language_loss": 0.87460434, + "learning_rate": 6.662428984145336e-05, + "loss": 0.88532257, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.14660645, + "step": 4360, + "time_per_iteration": 2.627370834350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022038, + "balance_loss_mlp": 1.01560092, + "epoch": 0.8389765294343978, + "flos": 1564188475392.0, + "grad_norm": 0.012085873021789567, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72802228, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.06445312, + "step": 4361, + "time_per_iteration": 5.010459899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073482, + "balance_loss_mlp": 1.05930793, + "epoch": 0.8391689111196614, + "flos": 602160708096.0, + "grad_norm": 0.0572272886330789, + "language_loss": 0.82823777, + "learning_rate": 6.631386895903308e-05, + "loss": 0.83897257, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.14160156, + "step": 4362, + "time_per_iteration": 2.922370195388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073502, + "balance_loss_mlp": 1.05868399, + "epoch": 0.839361292804925, + "flos": 443047408128.0, + "grad_norm": 
0.07860182159068019, + "language_loss": 0.80037236, + "learning_rate": 6.615891104554261e-05, + "loss": 0.8111074, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.14807129, + "step": 4363, + "time_per_iteration": 2.502601146697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072227, + "balance_loss_mlp": 1.05711174, + "epoch": 0.8395536744901886, + "flos": 594167768064.0, + "grad_norm": 0.07291966269797463, + "language_loss": 0.82605469, + "learning_rate": 6.600412156410057e-05, + "loss": 0.83677697, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.15100098, + "step": 4364, + "time_per_iteration": 2.713050365447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076545, + "balance_loss_mlp": 1.06210876, + "epoch": 0.8397460561754521, + "flos": 889836171264.0, + "grad_norm": 0.07837593762341759, + "language_loss": 0.84887516, + "learning_rate": 6.58495005748016e-05, + "loss": 0.8596406, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.14416504, + "step": 4365, + "time_per_iteration": 3.1587257385253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075757, + "balance_loss_mlp": 1.06149936, + "epoch": 0.8399384378607156, + "flos": 553503020544.0, + "grad_norm": 0.06763724554244926, + "language_loss": 0.89107072, + "learning_rate": 6.569504813767463e-05, + "loss": 0.90182829, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.14257812, + "step": 4366, + "time_per_iteration": 2.629777193069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107536, + "balance_loss_mlp": 1.06095958, + "epoch": 0.8401308195459792, + "flos": 518923832832.0, + "grad_norm": 0.061847950182012404, + "language_loss": 0.83264184, + "learning_rate": 6.554076431268341e-05, + "loss": 0.84339547, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.14404297, + "step": 4367, + "time_per_iteration": 2.659771680831909 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0107662, + "balance_loss_mlp": 1.06249356, + "epoch": 0.8403232012312428, + "flos": 684933221376.0, + "grad_norm": 0.07038928746315512, + "language_loss": 0.80698526, + "learning_rate": 6.538664915972648e-05, + "loss": 0.81775153, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.14123535, + "step": 4368, + "time_per_iteration": 3.017886161804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072638, + "balance_loss_mlp": 1.0580225, + "epoch": 0.8405155829165063, + "flos": 577672736256.0, + "grad_norm": 0.07391469226483313, + "language_loss": 0.77268881, + "learning_rate": 6.523270273863652e-05, + "loss": 0.7834152, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.14587402, + "step": 4369, + "time_per_iteration": 2.6887683868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073294, + "balance_loss_mlp": 1.05898881, + "epoch": 0.8407079646017699, + "flos": 456627041280.0, + "grad_norm": 0.12071561647223925, + "language_loss": 0.87840384, + "learning_rate": 6.507892510918079e-05, + "loss": 0.88913679, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.14294434, + "step": 4370, + "time_per_iteration": 2.521331548690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073523, + "balance_loss_mlp": 1.05930161, + "epoch": 0.8409003462870335, + "flos": 534917426688.0, + "grad_norm": 0.07405697321132997, + "language_loss": 0.81616879, + "learning_rate": 6.492531633106114e-05, + "loss": 0.82690406, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.14221191, + "step": 4371, + "time_per_iteration": 2.8012852668762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076911, + "balance_loss_mlp": 1.06248665, + "epoch": 0.8410927279722971, + "flos": 556759443456.0, + "grad_norm": 0.17788784846398228, + "language_loss": 0.77741635, + "learning_rate": 6.477187646391374e-05, + "loss": 0.78818548, + "num_input_tokens_seen": 
362795312, + "router_z_loss_mlp": 0.14404297, + "step": 4372, + "time_per_iteration": 2.7866506576538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023226, + "balance_loss_mlp": 1.01678848, + "epoch": 0.8412851096575606, + "flos": 1549754270208.0, + "grad_norm": 0.01277325762112691, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78702348, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.06445312, + "step": 4373, + "time_per_iteration": 4.969724655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079668, + "balance_loss_mlp": 1.06552935, + "epoch": 0.8414774913428241, + "flos": 552042749952.0, + "grad_norm": 0.10963981895984921, + "language_loss": 0.79011232, + "learning_rate": 6.446550370075271e-05, + "loss": 0.80090904, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.14147949, + "step": 4374, + "time_per_iteration": 2.7151315212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079428, + "balance_loss_mlp": 1.06480074, + "epoch": 0.8416698730280877, + "flos": 573015140352.0, + "grad_norm": 0.06677084771491004, + "language_loss": 0.77023661, + "learning_rate": 6.431257092368336e-05, + "loss": 0.78103089, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.14611816, + "step": 4375, + "time_per_iteration": 2.6808011531829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107382, + "balance_loss_mlp": 1.05907393, + "epoch": 0.8418622547133513, + "flos": 758731940352.0, + "grad_norm": 0.09075700701482696, + "language_loss": 0.80288577, + "learning_rate": 6.415980729547543e-05, + "loss": 0.81362402, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.1472168, + "step": 4376, + "time_per_iteration": 2.951115608215332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075249, + "balance_loss_mlp": 1.0609082, + "epoch": 0.8420546363986149, + "flos": 1074156940800.0, + "grad_norm": 
0.09043332509327401, + "language_loss": 0.72320813, + "learning_rate": 6.40072128754366e-05, + "loss": 0.73396063, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.14343262, + "step": 4377, + "time_per_iteration": 3.411957025527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075222, + "balance_loss_mlp": 1.06063056, + "epoch": 0.8422470180838784, + "flos": 525908754432.0, + "grad_norm": 0.09960608064306599, + "language_loss": 0.82466877, + "learning_rate": 6.385478772280933e-05, + "loss": 0.83542103, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.14575195, + "step": 4378, + "time_per_iteration": 2.7343966960906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074621, + "balance_loss_mlp": 1.06004119, + "epoch": 0.842439399769142, + "flos": 600834060288.0, + "grad_norm": 0.0684628860225588, + "language_loss": 0.82174343, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83248967, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.14562988, + "step": 4379, + "time_per_iteration": 2.743713140487671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078112, + "balance_loss_mlp": 1.06365216, + "epoch": 0.8426317814544055, + "flos": 552222987264.0, + "grad_norm": 0.07159027255471458, + "language_loss": 0.869488, + "learning_rate": 6.355044545643073e-05, + "loss": 0.88026911, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.14440918, + "step": 4380, + "time_per_iteration": 2.8095319271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076585, + "balance_loss_mlp": 1.06231618, + "epoch": 0.8428241631396691, + "flos": 678832980480.0, + "grad_norm": 0.07156323252818027, + "language_loss": 0.77553236, + "learning_rate": 6.33985284608356e-05, + "loss": 0.78629822, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.1427002, + "step": 4381, + "time_per_iteration": 2.8225574493408203 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01079603, + "balance_loss_mlp": 1.06550026, + "epoch": 0.8430165448249327, + "flos": 753730748928.0, + "grad_norm": 0.060495968283249074, + "language_loss": 0.79683161, + "learning_rate": 6.324678096896435e-05, + "loss": 0.80762756, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.14099121, + "step": 4382, + "time_per_iteration": 3.090226650238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079802, + "balance_loss_mlp": 1.06586623, + "epoch": 0.8432089265101962, + "flos": 699140574720.0, + "grad_norm": 0.06822593281534445, + "language_loss": 0.80561733, + "learning_rate": 6.30952030397306e-05, + "loss": 0.81641531, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.13952637, + "step": 4383, + "time_per_iteration": 2.902010917663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077419, + "balance_loss_mlp": 1.06318569, + "epoch": 0.8434013081954598, + "flos": 485767839744.0, + "grad_norm": 0.0829760023739812, + "language_loss": 0.84329182, + "learning_rate": 6.294379473198208e-05, + "loss": 0.85406601, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.14233398, + "step": 4384, + "time_per_iteration": 2.672295570373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077907, + "balance_loss_mlp": 1.06353092, + "epoch": 0.8435936898807234, + "flos": 520623811584.0, + "grad_norm": 0.09380658686475808, + "language_loss": 0.85271668, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86349577, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.14355469, + "step": 4385, + "time_per_iteration": 2.6639716625213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079599, + "balance_loss_mlp": 1.06513858, + "epoch": 0.843786071565987, + "flos": 785945534976.0, + "grad_norm": 0.07988119482228719, + "language_loss": 0.80665654, + "learning_rate": 6.264148721600254e-05, + "loss": 0.81745255, + "num_input_tokens_seen": 
364031552, + "router_z_loss_mlp": 0.14453125, + "step": 4386, + "time_per_iteration": 3.0548393726348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020283, + "balance_loss_mlp": 1.0138458, + "epoch": 0.8439784532512504, + "flos": 1446278436864.0, + "grad_norm": 0.009203156956610654, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76856798, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.06445312, + "step": 4387, + "time_per_iteration": 4.945947170257568 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107942, + "balance_loss_mlp": 1.06507921, + "epoch": 0.844170834936514, + "flos": 708700243968.0, + "grad_norm": 0.08582903171575712, + "language_loss": 0.82610214, + "learning_rate": 6.23398588904906e-05, + "loss": 0.8368963, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.14343262, + "step": 4388, + "time_per_iteration": 2.879181385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079902, + "balance_loss_mlp": 1.06535816, + "epoch": 0.8443632166217776, + "flos": 483428030976.0, + "grad_norm": 0.07348538767622947, + "language_loss": 0.79642034, + "learning_rate": 6.218929957057922e-05, + "loss": 0.80721939, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.14526367, + "step": 4389, + "time_per_iteration": 2.6795496940612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080974, + "balance_loss_mlp": 1.06649029, + "epoch": 0.8445555983070412, + "flos": 678694588416.0, + "grad_norm": 0.07673938165161245, + "language_loss": 0.80120802, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81201774, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.14453125, + "step": 4390, + "time_per_iteration": 2.8635592460632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080009, + "balance_loss_mlp": 1.0658704, + "epoch": 0.8447479799923048, + "flos": 741485477376.0, + "grad_norm": 
0.07689839370014714, + "language_loss": 0.7424233, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75322342, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.14135742, + "step": 4391, + "time_per_iteration": 2.977808952331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080831, + "balance_loss_mlp": 1.06648993, + "epoch": 0.8449403616775683, + "flos": 953306537472.0, + "grad_norm": 0.06854882269895202, + "language_loss": 0.80483949, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81564778, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.14343262, + "step": 4392, + "time_per_iteration": 3.2617368698120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083639, + "balance_loss_mlp": 1.06892824, + "epoch": 0.8451327433628318, + "flos": 657363921408.0, + "grad_norm": 0.08738597947785028, + "language_loss": 0.72036451, + "learning_rate": 6.158876260634871e-05, + "loss": 0.73120093, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.14685059, + "step": 4393, + "time_per_iteration": 2.9041545391082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107506, + "balance_loss_mlp": 1.06110108, + "epoch": 0.8453251250480954, + "flos": 446113681920.0, + "grad_norm": 0.08852500649821744, + "language_loss": 0.83482921, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84557986, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.13977051, + "step": 4394, + "time_per_iteration": 2.5223376750946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079032, + "balance_loss_mlp": 1.06472635, + "epoch": 0.845517506733359, + "flos": 542767205376.0, + "grad_norm": 0.08312411172641776, + "language_loss": 0.71075082, + "learning_rate": 6.128951512927305e-05, + "loss": 0.72154111, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.14294434, + "step": 4395, + "time_per_iteration": 2.676872968673706 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01076651, + "balance_loss_mlp": 1.06210768, + "epoch": 0.8457098884186226, + "flos": 502440910848.0, + "grad_norm": 0.09142879827690771, + "language_loss": 0.84363878, + "learning_rate": 6.114014684548046e-05, + "loss": 0.85440528, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.14526367, + "step": 4396, + "time_per_iteration": 2.6433725357055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078224, + "balance_loss_mlp": 1.06416929, + "epoch": 0.8459022701038861, + "flos": 448893259776.0, + "grad_norm": 0.0794015178696456, + "language_loss": 0.79516685, + "learning_rate": 6.099094894219326e-05, + "loss": 0.80594903, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.14050293, + "step": 4397, + "time_per_iteration": 2.7340524196624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072834, + "balance_loss_mlp": 1.05860019, + "epoch": 0.8460946517891497, + "flos": 743178115584.0, + "grad_norm": 0.0800433568929215, + "language_loss": 0.75171196, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.76244032, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.14233398, + "step": 4398, + "time_per_iteration": 2.9435505867004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076139, + "balance_loss_mlp": 1.06173849, + "epoch": 0.8462870334744133, + "flos": 553216324608.0, + "grad_norm": 0.10324502308758304, + "language_loss": 0.79907882, + "learning_rate": 6.069306450876389e-05, + "loss": 0.8098402, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.14379883, + "step": 4399, + "time_per_iteration": 2.844953775405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019199, + "balance_loss_mlp": 1.01285696, + "epoch": 0.8464794151596768, + "flos": 1564877864448.0, + "grad_norm": 0.008987182003831137, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82727766, + "num_input_tokens_seen": 
365384528, + "router_z_loss_mlp": 0.06347656, + "step": 4400, + "time_per_iteration": 4.9445812702178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072123, + "balance_loss_mlp": 1.05743694, + "epoch": 0.8466717968449403, + "flos": 550197038592.0, + "grad_norm": 0.06222883807624679, + "language_loss": 0.79746759, + "learning_rate": 6.039586229158084e-05, + "loss": 0.80818892, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.14685059, + "step": 4401, + "time_per_iteration": 2.7119193077087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074863, + "balance_loss_mlp": 1.06054568, + "epoch": 0.8468641785302039, + "flos": 551919038976.0, + "grad_norm": 0.06716515000041562, + "language_loss": 0.84632695, + "learning_rate": 6.024751715835314e-05, + "loss": 0.85707557, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.14294434, + "step": 4402, + "time_per_iteration": 2.781859874725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072431, + "balance_loss_mlp": 1.05786383, + "epoch": 0.8470565602154675, + "flos": 572671544832.0, + "grad_norm": 0.14264875428102675, + "language_loss": 0.87237591, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88310021, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.14550781, + "step": 4403, + "time_per_iteration": 2.743601083755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_mlp": 1.06340051, + "epoch": 0.8472489419007311, + "flos": 472833179136.0, + "grad_norm": 0.08442038658204883, + "language_loss": 0.83985877, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85063827, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.14526367, + "step": 4404, + "time_per_iteration": 2.5450549125671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076393, + "balance_loss_mlp": 1.062374, + "epoch": 0.8474413235859947, + "flos": 798020481024.0, + "grad_norm": 
0.06525598826964277, + "language_loss": 0.795784, + "learning_rate": 5.980350635103954e-05, + "loss": 0.80654788, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.14025879, + "step": 4405, + "time_per_iteration": 2.9938158988952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077872, + "balance_loss_mlp": 1.06393683, + "epoch": 0.8476337052712581, + "flos": 502379241984.0, + "grad_norm": 0.07458633653372311, + "language_loss": 0.80359912, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.8143779, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.13934326, + "step": 4406, + "time_per_iteration": 2.5586020946502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074026, + "balance_loss_mlp": 1.05969727, + "epoch": 0.8478260869565217, + "flos": 931971101184.0, + "grad_norm": 0.0649551452480515, + "language_loss": 0.83187521, + "learning_rate": 5.9508353547573e-05, + "loss": 0.84261543, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.14343262, + "step": 4407, + "time_per_iteration": 3.2481842041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077659, + "balance_loss_mlp": 1.0633297, + "epoch": 0.8480184686417853, + "flos": 708811471872.0, + "grad_norm": 0.0832752237181532, + "language_loss": 0.80765074, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.81842732, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.14306641, + "step": 4408, + "time_per_iteration": 2.901926279067993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075621, + "balance_loss_mlp": 1.06122029, + "epoch": 0.8482108503270489, + "flos": 614440857600.0, + "grad_norm": 0.06678078731558451, + "language_loss": 0.8214063, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.8321625, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.14379883, + "step": 4409, + "time_per_iteration": 2.829897403717041 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01075038, + "balance_loss_mlp": 1.06055427, + "epoch": 0.8484032320123124, + "flos": 531016031232.0, + "grad_norm": 0.08086645135201266, + "language_loss": 0.82160944, + "learning_rate": 5.906690709037194e-05, + "loss": 0.83235979, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.14477539, + "step": 4410, + "time_per_iteration": 2.660163164138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024272, + "balance_loss_mlp": 1.01792979, + "epoch": 0.848595613697576, + "flos": 1542776315904.0, + "grad_norm": 0.013103843821954883, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77321184, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.06347656, + "step": 4411, + "time_per_iteration": 4.932765007019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074439, + "balance_loss_mlp": 1.06016994, + "epoch": 0.8487879953828396, + "flos": 677342974464.0, + "grad_norm": 0.07691974451074937, + "language_loss": 0.737957, + "learning_rate": 5.877346528406635e-05, + "loss": 0.74870145, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.1427002, + "step": 4412, + "time_per_iteration": 2.9196579456329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070882, + "balance_loss_mlp": 1.05686271, + "epoch": 0.8489803770681031, + "flos": 503673956352.0, + "grad_norm": 0.0819904874112488, + "language_loss": 0.79105639, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.8017652, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.14025879, + "step": 4413, + "time_per_iteration": 2.661724328994751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076909, + "balance_loss_mlp": 1.06291389, + "epoch": 0.8491727587533667, + "flos": 563186027520.0, + "grad_norm": 0.06775622187053532, + "language_loss": 0.77081567, + "learning_rate": 5.84807086750247e-05, + "loss": 0.78158486, + "num_input_tokens_seen": 
366566400, + "router_z_loss_mlp": 0.14001465, + "step": 4414, + "time_per_iteration": 2.8016960620880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071963, + "balance_loss_mlp": 1.0574671, + "epoch": 0.8493651404386302, + "flos": 459784719360.0, + "grad_norm": 0.09984055773639101, + "language_loss": 0.7783742, + "learning_rate": 5.833458746159243e-05, + "loss": 0.78909385, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.14489746, + "step": 4415, + "time_per_iteration": 2.5576140880584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075711, + "balance_loss_mlp": 1.06144118, + "epoch": 0.8495575221238938, + "flos": 461170838016.0, + "grad_norm": 0.09739646427251167, + "language_loss": 0.81540161, + "learning_rate": 5.818863771788013e-05, + "loss": 0.82615876, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.14257812, + "step": 4416, + "time_per_iteration": 2.6097960472106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072366, + "balance_loss_mlp": 1.05833459, + "epoch": 0.8497499038091574, + "flos": 870712063488.0, + "grad_norm": 0.11039248920807271, + "language_loss": 0.81449503, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82521868, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.14038086, + "step": 4417, + "time_per_iteration": 3.0810201168060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071844, + "balance_loss_mlp": 1.05743134, + "epoch": 0.849942285494421, + "flos": 779600443392.0, + "grad_norm": 0.09244345650082934, + "language_loss": 0.78268075, + "learning_rate": 5.789725286620018e-05, + "loss": 0.79339921, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.14404297, + "step": 4418, + "time_per_iteration": 3.004802703857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076267, + "balance_loss_mlp": 1.06206918, + "epoch": 0.8501346671796844, + "flos": 513816556032.0, + "grad_norm": 
0.06901259436124493, + "language_loss": 0.85190952, + "learning_rate": 5.775181787135819e-05, + "loss": 0.86267221, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.1418457, + "step": 4419, + "time_per_iteration": 2.701456308364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075748, + "balance_loss_mlp": 1.06159818, + "epoch": 0.850327048864948, + "flos": 621445602816.0, + "grad_norm": 0.06970940414254242, + "language_loss": 0.83750409, + "learning_rate": 5.76065545724877e-05, + "loss": 0.84826154, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.14147949, + "step": 4420, + "time_per_iteration": 2.8450427055358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073069, + "balance_loss_mlp": 1.05829954, + "epoch": 0.8505194305502116, + "flos": 774221524992.0, + "grad_norm": 0.06343395396056568, + "language_loss": 0.79527402, + "learning_rate": 5.746146302598454e-05, + "loss": 0.8060047, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.14758301, + "step": 4421, + "time_per_iteration": 3.0368168354034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010731, + "balance_loss_mlp": 1.05916452, + "epoch": 0.8507118122354752, + "flos": 465257613312.0, + "grad_norm": 0.06692154543848765, + "language_loss": 0.86414826, + "learning_rate": 5.731654328817859e-05, + "loss": 0.8748793, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.13964844, + "step": 4422, + "time_per_iteration": 2.5675909519195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080967, + "balance_loss_mlp": 1.06668544, + "epoch": 0.8509041939207388, + "flos": 534413417472.0, + "grad_norm": 0.06814499560191878, + "language_loss": 0.84655517, + "learning_rate": 5.717179541533257e-05, + "loss": 0.85736477, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.14282227, + "step": 4423, + "time_per_iteration": 2.6630845069885254 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01074411, + "balance_loss_mlp": 1.06011748, + "epoch": 0.8510965756060023, + "flos": 583738472448.0, + "grad_norm": 0.07713370691386924, + "language_loss": 0.83968955, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85043365, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.14306641, + "step": 4424, + "time_per_iteration": 2.678980827331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071118, + "balance_loss_mlp": 1.05659819, + "epoch": 0.8512889572912659, + "flos": 600841400832.0, + "grad_norm": 0.06685200855630355, + "language_loss": 0.77975464, + "learning_rate": 5.688281548923796e-05, + "loss": 0.79046577, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.1451416, + "step": 4425, + "time_per_iteration": 2.7655956745147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070813, + "balance_loss_mlp": 1.05638838, + "epoch": 0.8514813389765294, + "flos": 654791745024.0, + "grad_norm": 0.07982187700581499, + "language_loss": 0.78191173, + "learning_rate": 5.673858354818151e-05, + "loss": 0.79261982, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.14416504, + "step": 4426, + "time_per_iteration": 2.9217934608459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076173, + "balance_loss_mlp": 1.0619514, + "epoch": 0.851673720661793, + "flos": 429761811456.0, + "grad_norm": 0.1625431829590372, + "language_loss": 0.78373289, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.79449469, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.14221191, + "step": 4427, + "time_per_iteration": 2.58944034576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073431, + "balance_loss_mlp": 1.05903041, + "epoch": 0.8518661023470565, + "flos": 641572959744.0, + "grad_norm": 0.07087664669883431, + "language_loss": 0.79935998, + "learning_rate": 5.645063599002875e-05, + "loss": 0.8100943, + "num_input_tokens_seen": 
367591728, + "router_z_loss_mlp": 0.1439209, + "step": 4428, + "time_per_iteration": 2.7852087020874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074309, + "balance_loss_mlp": 1.06025457, + "epoch": 0.8520584840323201, + "flos": 562143504384.0, + "grad_norm": 0.06571018676034746, + "language_loss": 0.79440582, + "learning_rate": 5.630692048472363e-05, + "loss": 0.8051489, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.140625, + "step": 4429, + "time_per_iteration": 2.6801624298095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070263, + "balance_loss_mlp": 1.05610132, + "epoch": 0.8522508657175837, + "flos": 527050395648.0, + "grad_norm": 0.07096995462733162, + "language_loss": 0.78549665, + "learning_rate": 5.61633772363489e-05, + "loss": 0.79619926, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.14147949, + "step": 4430, + "time_per_iteration": 2.6519312858581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071162, + "balance_loss_mlp": 1.05683255, + "epoch": 0.8524432474028473, + "flos": 499120247808.0, + "grad_norm": 0.08116181214962478, + "language_loss": 0.80567259, + "learning_rate": 5.602000630063298e-05, + "loss": 0.8163842, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.14318848, + "step": 4431, + "time_per_iteration": 2.5764808654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069916, + "balance_loss_mlp": 1.05516994, + "epoch": 0.8526356290881109, + "flos": 421314048000.0, + "grad_norm": 0.0903842329917801, + "language_loss": 0.79655671, + "learning_rate": 5.587680773323706e-05, + "loss": 0.80725586, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.14709473, + "step": 4432, + "time_per_iteration": 2.488812208175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107009, + "balance_loss_mlp": 1.0557611, + "epoch": 0.8528280107733743, + "flos": 507328303104.0, + "grad_norm": 
0.0816751718621874, + "language_loss": 0.8067739, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.81747478, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.14331055, + "step": 4433, + "time_per_iteration": 2.6227025985717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073907, + "balance_loss_mlp": 1.06001878, + "epoch": 0.8530203924586379, + "flos": 445893797376.0, + "grad_norm": 0.08095349591121923, + "language_loss": 0.82720852, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.83794761, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.13891602, + "step": 4434, + "time_per_iteration": 2.5052199363708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069434, + "balance_loss_mlp": 1.05520046, + "epoch": 0.8532127741439015, + "flos": 657759273984.0, + "grad_norm": 0.07769115756981526, + "language_loss": 0.83331203, + "learning_rate": 5.54482467965825e-05, + "loss": 0.84400636, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.14221191, + "step": 4435, + "time_per_iteration": 2.8407375812530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066367, + "balance_loss_mlp": 1.05220532, + "epoch": 0.8534051558291651, + "flos": 536019420672.0, + "grad_norm": 0.06062923290615588, + "language_loss": 0.82938188, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.84004557, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.14160156, + "step": 4436, + "time_per_iteration": 2.721763849258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076869, + "balance_loss_mlp": 1.0626117, + "epoch": 0.8535975375144286, + "flos": 533000134656.0, + "grad_norm": 0.08849975131180282, + "language_loss": 0.79207104, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.80283976, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.14257812, + "step": 4437, + "time_per_iteration": 2.714531898498535 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.0106703, + "balance_loss_mlp": 1.05290413, + "epoch": 0.8537899191996922, + "flos": 574141727232.0, + "grad_norm": 0.08108043439435358, + "language_loss": 0.8220486, + "learning_rate": 5.502123917219848e-05, + "loss": 0.83271891, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.14123535, + "step": 4438, + "time_per_iteration": 2.6950736045837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106593, + "balance_loss_mlp": 1.05088568, + "epoch": 0.8539823008849557, + "flos": 465007993344.0, + "grad_norm": 0.06606452080680034, + "language_loss": 0.83545029, + "learning_rate": 5.48792487359433e-05, + "loss": 0.84610963, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.15014648, + "step": 4439, + "time_per_iteration": 2.6966865062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069848, + "balance_loss_mlp": 1.0552212, + "epoch": 0.8541746825702193, + "flos": 554713671168.0, + "grad_norm": 0.07917909499890975, + "language_loss": 0.81682485, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.82752335, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.14599609, + "step": 4440, + "time_per_iteration": 2.716801404953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067798, + "balance_loss_mlp": 1.05304027, + "epoch": 0.8543670642554829, + "flos": 546391816704.0, + "grad_norm": 0.07436951957293847, + "language_loss": 0.77523911, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.78591704, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.14733887, + "step": 4441, + "time_per_iteration": 2.8399622440338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071802, + "balance_loss_mlp": 1.05721068, + "epoch": 0.8545594459407464, + "flos": 512027744256.0, + "grad_norm": 0.07108815231458238, + "language_loss": 0.82236481, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83308291, + 
"num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.14575195, + "step": 4442, + "time_per_iteration": 2.6311261653900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069344, + "balance_loss_mlp": 1.05497956, + "epoch": 0.85475182762601, + "flos": 421185567744.0, + "grad_norm": 0.0731157508212472, + "language_loss": 0.81597567, + "learning_rate": 5.431301565318786e-05, + "loss": 0.8266691, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.14355469, + "step": 4443, + "time_per_iteration": 2.499732255935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067376, + "balance_loss_mlp": 1.05295157, + "epoch": 0.8549442093112736, + "flos": 389435516928.0, + "grad_norm": 0.10168520026489293, + "language_loss": 0.77461678, + "learning_rate": 5.41718898228542e-05, + "loss": 0.78529054, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.14428711, + "step": 4444, + "time_per_iteration": 2.5191171169281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065539, + "balance_loss_mlp": 1.05132949, + "epoch": 0.8551365909965372, + "flos": 605926282752.0, + "grad_norm": 0.10020390821281198, + "language_loss": 0.79534721, + "learning_rate": 5.403093707834334e-05, + "loss": 0.80600262, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.14196777, + "step": 4445, + "time_per_iteration": 2.80684757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072477, + "balance_loss_mlp": 1.05816031, + "epoch": 0.8553289726818007, + "flos": 504160713216.0, + "grad_norm": 0.06547914097019276, + "language_loss": 0.78441411, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.7951389, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.14294434, + "step": 4446, + "time_per_iteration": 2.5812063217163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070129, + "balance_loss_mlp": 1.05559802, + "epoch": 0.8555213543670642, + "flos": 
557009063424.0, + "grad_norm": 0.0766106578320322, + "language_loss": 0.75942904, + "learning_rate": 5.374955106561324e-05, + "loss": 0.77013028, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.14501953, + "step": 4447, + "time_per_iteration": 2.7502357959747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066331, + "balance_loss_mlp": 1.05213356, + "epoch": 0.8557137360523278, + "flos": 548104278528.0, + "grad_norm": 0.06446025999572932, + "language_loss": 0.74926281, + "learning_rate": 5.360911790663775e-05, + "loss": 0.75992608, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.14196777, + "step": 4448, + "time_per_iteration": 2.619159698486328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070306, + "balance_loss_mlp": 1.055691, + "epoch": 0.8559061177375914, + "flos": 728182628352.0, + "grad_norm": 0.06744228342977912, + "language_loss": 0.78711146, + "learning_rate": 5.346885805197238e-05, + "loss": 0.79781449, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.14611816, + "step": 4449, + "time_per_iteration": 2.975527286529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073651, + "balance_loss_mlp": 1.05888104, + "epoch": 0.856098499422855, + "flos": 535881028608.0, + "grad_norm": 0.09470809233033459, + "language_loss": 0.83172154, + "learning_rate": 5.332877155607085e-05, + "loss": 0.84245807, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.14758301, + "step": 4450, + "time_per_iteration": 2.6913669109344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072051, + "balance_loss_mlp": 1.05720961, + "epoch": 0.8562908811081185, + "flos": 573664882176.0, + "grad_norm": 0.0720637583069195, + "language_loss": 0.83487344, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.84559393, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.14831543, + "step": 4451, + "time_per_iteration": 2.7148618698120117 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068368, + "balance_loss_mlp": 1.05413437, + "epoch": 0.856483262793382, + "flos": 781754872320.0, + "grad_norm": 0.08319979714541847, + "language_loss": 0.80538082, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.81606448, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.14233398, + "step": 4452, + "time_per_iteration": 3.1150898933410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070338, + "balance_loss_mlp": 1.05599678, + "epoch": 0.8566756444786456, + "flos": 455819083776.0, + "grad_norm": 0.06133419120711316, + "language_loss": 0.84648192, + "learning_rate": 5.290955276447651e-05, + "loss": 0.85718524, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.14343262, + "step": 4453, + "time_per_iteration": 2.5603737831115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067449, + "balance_loss_mlp": 1.05266762, + "epoch": 0.8568680261639092, + "flos": 449382587904.0, + "grad_norm": 0.09315038231056039, + "language_loss": 0.84648412, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85715866, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.14782715, + "step": 4454, + "time_per_iteration": 2.5867726802825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074372, + "balance_loss_mlp": 1.05986428, + "epoch": 0.8570604078491728, + "flos": 479976316416.0, + "grad_norm": 0.07700145526385223, + "language_loss": 0.82769418, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83843791, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.14489746, + "step": 4455, + "time_per_iteration": 2.5525221824645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106982, + "balance_loss_mlp": 1.0550859, + "epoch": 0.8572527895344363, + "flos": 505942184448.0, + "grad_norm": 0.06363308666132952, + "language_loss": 0.84937072, + "learning_rate": 5.249189615562627e-05, + "loss": 
0.86006892, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.1472168, + "step": 4456, + "time_per_iteration": 2.576906681060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069538, + "balance_loss_mlp": 1.05516171, + "epoch": 0.8574451712196999, + "flos": 787044957696.0, + "grad_norm": 0.0582073915457821, + "language_loss": 0.82954866, + "learning_rate": 5.235302469011905e-05, + "loss": 0.840244, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.14379883, + "step": 4457, + "time_per_iteration": 3.0546817779541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062408, + "balance_loss_mlp": 1.04806721, + "epoch": 0.8576375529049635, + "flos": 509252935680.0, + "grad_norm": 0.06955438726938921, + "language_loss": 0.75106084, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.76168495, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.14318848, + "step": 4458, + "time_per_iteration": 2.6937506198883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011775, + "balance_loss_mlp": 1.00548077, + "epoch": 0.857829934590227, + "flos": 1460772486144.0, + "grad_norm": 0.008113169316068945, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85779065, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.06298828, + "step": 4459, + "time_per_iteration": 4.911555528640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068424, + "balance_loss_mlp": 1.05389237, + "epoch": 0.8580223162754905, + "flos": 479296839168.0, + "grad_norm": 0.145833654040799, + "language_loss": 0.89347082, + "learning_rate": 5.193745326073118e-05, + "loss": 0.90415508, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.1451416, + "step": 4460, + "time_per_iteration": 2.645474433898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069776, + "balance_loss_mlp": 1.05543506, + "epoch": 0.8582146979607541, + 
"flos": 706231954944.0, + "grad_norm": 0.08421529829088402, + "language_loss": 0.79048121, + "learning_rate": 5.179927728591227e-05, + "loss": 0.80117893, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.14331055, + "step": 4461, + "time_per_iteration": 2.8346517086029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071468, + "balance_loss_mlp": 1.05721021, + "epoch": 0.8584070796460177, + "flos": 765158524416.0, + "grad_norm": 0.08957550306757553, + "language_loss": 0.82738662, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.83810127, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.14245605, + "step": 4462, + "time_per_iteration": 3.047076463699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069247, + "balance_loss_mlp": 1.05483508, + "epoch": 0.8585994613312813, + "flos": 586829339136.0, + "grad_norm": 0.0707996237534643, + "language_loss": 0.85821873, + "learning_rate": 5.152344741070919e-05, + "loss": 0.86891121, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.14404297, + "step": 4463, + "time_per_iteration": 2.789858102798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066518, + "balance_loss_mlp": 1.052356, + "epoch": 0.8587918430165449, + "flos": 608295826944.0, + "grad_norm": 0.08127144245962697, + "language_loss": 0.78870726, + "learning_rate": 5.138579361741169e-05, + "loss": 0.79937249, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.14147949, + "step": 4464, + "time_per_iteration": 2.7969038486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067345, + "balance_loss_mlp": 1.05314672, + "epoch": 0.8589842247018084, + "flos": 588981570048.0, + "grad_norm": 0.07472876002121234, + "language_loss": 0.80512178, + "learning_rate": 5.124831399159535e-05, + "loss": 0.81579524, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.14196777, + "step": 4465, + "time_per_iteration": 2.736020565032959 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074093, + "balance_loss_mlp": 1.05929875, + "epoch": 0.8591766063870719, + "flos": 543879111168.0, + "grad_norm": 0.11520064684359647, + "language_loss": 0.78347111, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.79421198, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.14758301, + "step": 4466, + "time_per_iteration": 2.7088613510131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072551, + "balance_loss_mlp": 1.05799568, + "epoch": 0.8593689880723355, + "flos": 493756010496.0, + "grad_norm": 0.07199823899248142, + "language_loss": 0.80669403, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.81741953, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.14526367, + "step": 4467, + "time_per_iteration": 2.751774311065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074751, + "balance_loss_mlp": 1.06001639, + "epoch": 0.8595613697575991, + "flos": 533909408256.0, + "grad_norm": 0.07801691002975698, + "language_loss": 0.83068347, + "learning_rate": 5.083692065243822e-05, + "loss": 0.84143102, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.1472168, + "step": 4468, + "time_per_iteration": 2.6254448890686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070926, + "balance_loss_mlp": 1.05635858, + "epoch": 0.8597537514428626, + "flos": 617628271104.0, + "grad_norm": 0.07457537179448775, + "language_loss": 0.76102448, + "learning_rate": 5.070013822961328e-05, + "loss": 0.77173376, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.14562988, + "step": 4469, + "time_per_iteration": 2.78564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106627, + "balance_loss_mlp": 1.05185747, + "epoch": 0.8599461331281262, + "flos": 608730826752.0, + "grad_norm": 0.07387770607990847, + "language_loss": 0.83740634, + "learning_rate": 5.056353024046462e-05, + "loss": 
0.84806907, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.14416504, + "step": 4470, + "time_per_iteration": 2.7199819087982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073459, + "balance_loss_mlp": 1.05884385, + "epoch": 0.8601385148133898, + "flos": 551252044800.0, + "grad_norm": 0.07776930298197288, + "language_loss": 0.83086514, + "learning_rate": 5.042709673802786e-05, + "loss": 0.84159976, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.14599609, + "step": 4471, + "time_per_iteration": 2.655369281768799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106711, + "balance_loss_mlp": 1.05244768, + "epoch": 0.8603308964986534, + "flos": 581200800768.0, + "grad_norm": 0.05601587567115835, + "language_loss": 0.80901635, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.81968743, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.14648438, + "step": 4472, + "time_per_iteration": 2.8570289611816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073921, + "balance_loss_mlp": 1.05958033, + "epoch": 0.8605232781839169, + "flos": 629013828096.0, + "grad_norm": 0.0851895281729739, + "language_loss": 0.7508207, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.76155984, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.14331055, + "step": 4473, + "time_per_iteration": 2.7473347187042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066806, + "balance_loss_mlp": 1.05288196, + "epoch": 0.8607156598691804, + "flos": 468141078528.0, + "grad_norm": 0.0733266349612676, + "language_loss": 0.76999867, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.78066671, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.13928223, + "step": 4474, + "time_per_iteration": 2.511343002319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069976, + "balance_loss_mlp": 1.0554204, + "epoch": 0.860908041554444, + 
"flos": 488394344448.0, + "grad_norm": 0.06480096420670076, + "language_loss": 0.82572103, + "learning_rate": 4.988310865374945e-05, + "loss": 0.83642077, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.14550781, + "step": 4475, + "time_per_iteration": 2.6399173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066208, + "balance_loss_mlp": 1.05160475, + "epoch": 0.8611004232397076, + "flos": 592094831616.0, + "grad_norm": 0.08039350372940637, + "language_loss": 0.80106586, + "learning_rate": 4.974754837804057e-05, + "loss": 0.81172794, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.14575195, + "step": 4476, + "time_per_iteration": 2.7327587604522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067278, + "balance_loss_mlp": 1.05268657, + "epoch": 0.8612928049249712, + "flos": 774209041920.0, + "grad_norm": 0.08404476635777386, + "language_loss": 0.86105013, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.87172294, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.14587402, + "step": 4477, + "time_per_iteration": 3.0373780727386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070558, + "balance_loss_mlp": 1.0560379, + "epoch": 0.8614851866102347, + "flos": 537553843200.0, + "grad_norm": 0.07409303863444187, + "language_loss": 0.82399005, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83469558, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.14501953, + "step": 4478, + "time_per_iteration": 2.6591262817382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065378, + "balance_loss_mlp": 1.05058384, + "epoch": 0.8616775682954982, + "flos": 565916419584.0, + "grad_norm": 0.0631568750529317, + "language_loss": 0.78993368, + "learning_rate": 4.934191658211729e-05, + "loss": 0.80058742, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.14758301, + "step": 4479, + "time_per_iteration": 
2.658097267150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064115, + "balance_loss_mlp": 1.04950047, + "epoch": 0.8618699499807618, + "flos": 481592231424.0, + "grad_norm": 0.08056621333119694, + "language_loss": 0.81684464, + "learning_rate": 4.92070558355221e-05, + "loss": 0.8274858, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.14599609, + "step": 4480, + "time_per_iteration": 2.740461826324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065788, + "balance_loss_mlp": 1.05064893, + "epoch": 0.8620623316660254, + "flos": 649506802176.0, + "grad_norm": 0.09178637481002815, + "language_loss": 0.7409358, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.75159371, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.15124512, + "step": 4481, + "time_per_iteration": 2.8202409744262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070778, + "balance_loss_mlp": 1.05633044, + "epoch": 0.862254713351289, + "flos": 751781523456.0, + "grad_norm": 0.07336978506416574, + "language_loss": 0.85627228, + "learning_rate": 4.893785943464801e-05, + "loss": 0.86698008, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.14428711, + "step": 4482, + "time_per_iteration": 2.9723026752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072478, + "balance_loss_mlp": 1.05771959, + "epoch": 0.8624470950365525, + "flos": 841543727616.0, + "grad_norm": 0.06427731204985579, + "language_loss": 0.77644771, + "learning_rate": 4.880352388488024e-05, + "loss": 0.7871725, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.14746094, + "step": 4483, + "time_per_iteration": 3.2497451305389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072184, + "balance_loss_mlp": 1.05741429, + "epoch": 0.8626394767218161, + "flos": 754793468928.0, + "grad_norm": 0.0734090196676215, + "language_loss": 0.83015764, + "learning_rate": 
4.866936350511969e-05, + "loss": 0.84087956, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.14746094, + "step": 4484, + "time_per_iteration": 2.8956780433654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075352, + "balance_loss_mlp": 1.06092763, + "epoch": 0.8628318584070797, + "flos": 703585626624.0, + "grad_norm": 0.06806275994397937, + "language_loss": 0.82180882, + "learning_rate": 4.853537834745203e-05, + "loss": 0.83256233, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.14404297, + "step": 4485, + "time_per_iteration": 2.9138083457946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066713, + "balance_loss_mlp": 1.05188346, + "epoch": 0.8630242400923432, + "flos": 471244428288.0, + "grad_norm": 0.06130669140351844, + "language_loss": 0.77192688, + "learning_rate": 4.840156846389487e-05, + "loss": 0.78259403, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.14807129, + "step": 4486, + "time_per_iteration": 2.5923945903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068939, + "balance_loss_mlp": 1.05420458, + "epoch": 0.8632166217776067, + "flos": 964363553280.0, + "grad_norm": 0.09142848805617776, + "language_loss": 0.77645731, + "learning_rate": 4.826793390639783e-05, + "loss": 0.78714675, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.14697266, + "step": 4487, + "time_per_iteration": 3.2063825130462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066737, + "balance_loss_mlp": 1.05182362, + "epoch": 0.8634090034628703, + "flos": 767913509376.0, + "grad_norm": 0.07487665113796628, + "language_loss": 0.78699821, + "learning_rate": 4.813447472684246e-05, + "loss": 0.7976656, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.14880371, + "step": 4488, + "time_per_iteration": 3.005026340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069915, + "balance_loss_mlp": 1.05519223, 
+ "epoch": 0.8636013851481339, + "flos": 520591504896.0, + "grad_norm": 0.07136180617558878, + "language_loss": 0.8320052, + "learning_rate": 4.800119097704214e-05, + "loss": 0.8427043, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.14697266, + "step": 4489, + "time_per_iteration": 2.7364392280578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067253, + "balance_loss_mlp": 1.05310261, + "epoch": 0.8637937668333975, + "flos": 632144342016.0, + "grad_norm": 0.08078555791149708, + "language_loss": 0.80594444, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.81661701, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.14135742, + "step": 4490, + "time_per_iteration": 2.7436652183532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067307, + "balance_loss_mlp": 1.05278766, + "epoch": 0.8639861485186611, + "flos": 856094676480.0, + "grad_norm": 0.12060339505019638, + "language_loss": 0.76427901, + "learning_rate": 4.773514997362e-05, + "loss": 0.77495205, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.1451416, + "step": 4491, + "time_per_iteration": 3.0809972286224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107324, + "balance_loss_mlp": 1.05887485, + "epoch": 0.8641785302039245, + "flos": 481261118976.0, + "grad_norm": 0.07217644501774635, + "language_loss": 0.77776736, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.78849971, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.14355469, + "step": 4492, + "time_per_iteration": 2.5654242038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068525, + "balance_loss_mlp": 1.05417252, + "epoch": 0.8643709118891881, + "flos": 504637558272.0, + "grad_norm": 0.06656380617407046, + "language_loss": 0.80193943, + "learning_rate": 4.746981130927675e-05, + "loss": 0.81262463, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.14355469, + "step": 4493, + 
"time_per_iteration": 2.670027017593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069512, + "balance_loss_mlp": 1.0553143, + "epoch": 0.8645632935744517, + "flos": 552368719872.0, + "grad_norm": 0.07682965600058904, + "language_loss": 0.82227212, + "learning_rate": 4.733740548306908e-05, + "loss": 0.83296728, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.1418457, + "step": 4494, + "time_per_iteration": 2.8134214878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065325, + "balance_loss_mlp": 1.05059028, + "epoch": 0.8647556752597153, + "flos": 524737751040.0, + "grad_norm": 0.07423694225628534, + "language_loss": 0.83722866, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.84788191, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.14709473, + "step": 4495, + "time_per_iteration": 2.6327974796295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073184, + "balance_loss_mlp": 1.058617, + "epoch": 0.8649480569449788, + "flos": 787768851456.0, + "grad_norm": 0.07327759131126368, + "language_loss": 0.82331359, + "learning_rate": 4.707312109960471e-05, + "loss": 0.83404541, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.14550781, + "step": 4496, + "time_per_iteration": 3.0912046432495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069634, + "balance_loss_mlp": 1.05510235, + "epoch": 0.8651404386302424, + "flos": 763863810048.0, + "grad_norm": 0.07168527754469435, + "language_loss": 0.76572919, + "learning_rate": 4.694124264495225e-05, + "loss": 0.77642548, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.14526367, + "step": 4497, + "time_per_iteration": 3.043983221054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067126, + "balance_loss_mlp": 1.05242729, + "epoch": 0.865332820315506, + "flos": 539893651968.0, + "grad_norm": 0.06672148584228833, + "language_loss": 0.82233298, + 
"learning_rate": 4.680954008330851e-05, + "loss": 0.83300424, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.14685059, + "step": 4498, + "time_per_iteration": 2.719404697418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009012, + "balance_loss_mlp": 1.0026226, + "epoch": 0.8655252020007695, + "flos": 1476632830464.0, + "grad_norm": 0.004886545059894445, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80183458, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.06396484, + "step": 4499, + "time_per_iteration": 4.798980474472046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062842, + "balance_loss_mlp": 1.0486083, + "epoch": 0.8657175836860331, + "flos": 517369586688.0, + "grad_norm": 0.08270654530250093, + "language_loss": 0.82950461, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.84013307, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.14233398, + "step": 4500, + "time_per_iteration": 2.731417179107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106723, + "balance_loss_mlp": 1.05263877, + "epoch": 0.8659099653712966, + "flos": 590523333120.0, + "grad_norm": 0.07191381207287514, + "language_loss": 0.80231231, + "learning_rate": 4.641548826740394e-05, + "loss": 0.81298465, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.14575195, + "step": 4501, + "time_per_iteration": 2.697899341583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062306, + "balance_loss_mlp": 1.04800117, + "epoch": 0.8661023470565602, + "flos": 590449181184.0, + "grad_norm": 0.05594849429502133, + "language_loss": 0.87944901, + "learning_rate": 4.628448978842731e-05, + "loss": 0.89007205, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.14282227, + "step": 4502, + "time_per_iteration": 2.8466720581054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064504, + 
"balance_loss_mlp": 1.04962659, + "epoch": 0.8662947287418238, + "flos": 567670726656.0, + "grad_norm": 0.06639072474575029, + "language_loss": 0.79237312, + "learning_rate": 4.61536674574336e-05, + "loss": 0.80301815, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.1484375, + "step": 4503, + "time_per_iteration": 2.7786972522735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072513, + "balance_loss_mlp": 1.05816054, + "epoch": 0.8664871104270874, + "flos": 515929139712.0, + "grad_norm": 0.05596301898353544, + "language_loss": 0.82147396, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.8321991, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.14355469, + "step": 4504, + "time_per_iteration": 2.7921864986419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075701, + "balance_loss_mlp": 1.06135976, + "epoch": 0.866679492112351, + "flos": 557263452672.0, + "grad_norm": 0.07445535583019337, + "language_loss": 0.78300965, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79376662, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.14343262, + "step": 4505, + "time_per_iteration": 2.857663154602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062777, + "balance_loss_mlp": 1.04850721, + "epoch": 0.8668718737976144, + "flos": 722448004608.0, + "grad_norm": 0.09695588327061085, + "language_loss": 0.81800681, + "learning_rate": 4.57622578599054e-05, + "loss": 0.82863462, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.14257812, + "step": 4506, + "time_per_iteration": 2.929633855819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065238, + "balance_loss_mlp": 1.050861, + "epoch": 0.867064255482878, + "flos": 600705580032.0, + "grad_norm": 0.07502570041453936, + "language_loss": 0.84632653, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.85697895, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 
0.14367676, + "step": 4507, + "time_per_iteration": 2.7329187393188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068231, + "balance_loss_mlp": 1.05338943, + "epoch": 0.8672566371681416, + "flos": 803527879680.0, + "grad_norm": 0.07513188405638076, + "language_loss": 0.76312721, + "learning_rate": 4.550219979745529e-05, + "loss": 0.77380955, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.14831543, + "step": 4508, + "time_per_iteration": 3.0379912853240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064243, + "balance_loss_mlp": 1.04998589, + "epoch": 0.8674490188534052, + "flos": 627368177664.0, + "grad_norm": 0.061997847025714266, + "language_loss": 0.83527964, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.84592211, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.14257812, + "step": 4509, + "time_per_iteration": 2.7606923580169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071433, + "balance_loss_mlp": 1.05680609, + "epoch": 0.8676414005386687, + "flos": 727831692288.0, + "grad_norm": 0.06323363214772874, + "language_loss": 0.86261082, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87332517, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.14624023, + "step": 4510, + "time_per_iteration": 2.975365161895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106234, + "balance_loss_mlp": 1.04809463, + "epoch": 0.8678337822239323, + "flos": 539972573184.0, + "grad_norm": 0.08527155425852233, + "language_loss": 0.80449998, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.81512344, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.14257812, + "step": 4511, + "time_per_iteration": 2.7997350692749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067767, + "balance_loss_mlp": 1.05356872, + "epoch": 0.8680261639091958, + "flos": 507521023488.0, + "grad_norm": 0.13843018607601695, + 
"language_loss": 0.79428059, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.80495822, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.14196777, + "step": 4512, + "time_per_iteration": 2.6306517124176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066222, + "balance_loss_mlp": 1.05204797, + "epoch": 0.8682185455944594, + "flos": 487126794240.0, + "grad_norm": 0.0652028208920273, + "language_loss": 0.80685651, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.81751871, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.14160156, + "step": 4513, + "time_per_iteration": 2.631469488143921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063365, + "balance_loss_mlp": 1.04905963, + "epoch": 0.868410927279723, + "flos": 603690361344.0, + "grad_norm": 0.08387419334599636, + "language_loss": 0.80628252, + "learning_rate": 4.472626206030528e-05, + "loss": 0.81691617, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.14306641, + "step": 4514, + "time_per_iteration": 2.703423500061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061634, + "balance_loss_mlp": 1.04734087, + "epoch": 0.8686033089649865, + "flos": 1118985186816.0, + "grad_norm": 0.09897046417963085, + "language_loss": 0.84731203, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.8579284, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.14294434, + "step": 4515, + "time_per_iteration": 3.3720173835754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066888, + "balance_loss_mlp": 1.05241561, + "epoch": 0.8687956906502501, + "flos": 568019091456.0, + "grad_norm": 0.0907599826984789, + "language_loss": 0.83635509, + "learning_rate": 4.446902963685862e-05, + "loss": 0.84702396, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.14477539, + "step": 4516, + "time_per_iteration": 2.661489248275757 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01065423, + "balance_loss_mlp": 1.05126095, + "epoch": 0.8689880723355137, + "flos": 544338703872.0, + "grad_norm": 0.07393998563485746, + "language_loss": 0.84213966, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.85279387, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.14147949, + "step": 4517, + "time_per_iteration": 2.6653032302856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060388, + "balance_loss_mlp": 1.0461185, + "epoch": 0.8691804540207773, + "flos": 457425086976.0, + "grad_norm": 0.06604754352210267, + "language_loss": 0.86236376, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.8729676, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.14257812, + "step": 4518, + "time_per_iteration": 2.6518642902374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065916, + "balance_loss_mlp": 1.05138481, + "epoch": 0.8693728357060407, + "flos": 591872375808.0, + "grad_norm": 0.07030672265979203, + "language_loss": 0.80032271, + "learning_rate": 4.40845075221456e-05, + "loss": 0.81098187, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.1451416, + "step": 4519, + "time_per_iteration": 2.747318983078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061664, + "balance_loss_mlp": 1.04732347, + "epoch": 0.8695652173913043, + "flos": 680263515648.0, + "grad_norm": 0.08647711419457829, + "language_loss": 0.7937988, + "learning_rate": 4.395668742181164e-05, + "loss": 0.80441546, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.14318848, + "step": 4520, + "time_per_iteration": 2.8835909366607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066407, + "balance_loss_mlp": 1.05213761, + "epoch": 0.8697575990765679, + "flos": 492362551296.0, + "grad_norm": 0.0756040162570651, + "language_loss": 0.78086627, + "learning_rate": 4.38290443731934e-05, + "loss": 0.79153037, + "num_input_tokens_seen": 
374681888, + "router_z_loss_mlp": 0.14257812, + "step": 4521, + "time_per_iteration": 2.5724833011627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066494, + "balance_loss_mlp": 1.05209351, + "epoch": 0.8699499807618315, + "flos": 526949079552.0, + "grad_norm": 0.062480964319909835, + "language_loss": 0.81658232, + "learning_rate": 4.370157842584671e-05, + "loss": 0.82724726, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.14404297, + "step": 4522, + "time_per_iteration": 2.6957974433898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065943, + "balance_loss_mlp": 1.05160189, + "epoch": 0.8701423624470951, + "flos": 814342616064.0, + "grad_norm": 0.06768287451120002, + "language_loss": 0.80298173, + "learning_rate": 4.357428962925808e-05, + "loss": 0.81364119, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.14331055, + "step": 4523, + "time_per_iteration": 3.1663365364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064286, + "balance_loss_mlp": 1.04987335, + "epoch": 0.8703347441323586, + "flos": 556789178880.0, + "grad_norm": 0.06671589316380268, + "language_loss": 0.88140607, + "learning_rate": 4.344717803284542e-05, + "loss": 0.89204895, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.14416504, + "step": 4524, + "time_per_iteration": 2.6627633571624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064956, + "balance_loss_mlp": 1.04983997, + "epoch": 0.8705271258176221, + "flos": 585443220480.0, + "grad_norm": 0.06181301750116614, + "language_loss": 0.84106493, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.85171449, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.15100098, + "step": 4525, + "time_per_iteration": 2.813011646270752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106471, + "balance_loss_mlp": 1.04989266, + "epoch": 0.8707195075028857, + "flos": 669216411648.0, + 
"grad_norm": 0.06605227762602037, + "language_loss": 0.8533206, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86396778, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.14794922, + "step": 4526, + "time_per_iteration": 2.8933169841766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065001, + "balance_loss_mlp": 1.05061281, + "epoch": 0.8709118891881493, + "flos": 520391443968.0, + "grad_norm": 0.060370068078767804, + "language_loss": 0.83663857, + "learning_rate": 4.306690693781007e-05, + "loss": 0.84728855, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.1439209, + "step": 4527, + "time_per_iteration": 2.761434555053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064419, + "balance_loss_mlp": 1.04992294, + "epoch": 0.8711042708734128, + "flos": 553208984064.0, + "grad_norm": 0.08414030206759188, + "language_loss": 0.81535316, + "learning_rate": 4.294050463490401e-05, + "loss": 0.82599723, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.14489746, + "step": 4528, + "time_per_iteration": 2.650632619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062212, + "balance_loss_mlp": 1.04762089, + "epoch": 0.8712966525586764, + "flos": 502193862144.0, + "grad_norm": 0.09478165614322998, + "language_loss": 0.81905985, + "learning_rate": 4.281427977823094e-05, + "loss": 0.82968199, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.14587402, + "step": 4529, + "time_per_iteration": 2.7222495079040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106655, + "balance_loss_mlp": 1.05204225, + "epoch": 0.87148903424394, + "flos": 804096129024.0, + "grad_norm": 0.09748177574761158, + "language_loss": 0.73896039, + "learning_rate": 4.268823241679593e-05, + "loss": 0.74962586, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.14489746, + "step": 4530, + "time_per_iteration": 3.050337791442871 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065647, + "balance_loss_mlp": 1.05079401, + "epoch": 0.8716814159292036, + "flos": 773438160384.0, + "grad_norm": 0.0689062748020189, + "language_loss": 0.86388242, + "learning_rate": 4.256236259953489e-05, + "loss": 0.8745389, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.14831543, + "step": 4531, + "time_per_iteration": 3.0478785037994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065486, + "balance_loss_mlp": 1.05087137, + "epoch": 0.8718737976144671, + "flos": 486835329024.0, + "grad_norm": 0.08577279593283388, + "language_loss": 0.85180438, + "learning_rate": 4.243667037531468e-05, + "loss": 0.86245918, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.14599609, + "step": 4532, + "time_per_iteration": 2.6602768898010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059568, + "balance_loss_mlp": 1.04522741, + "epoch": 0.8720661792997306, + "flos": 584123913216.0, + "grad_norm": 0.0657296857240319, + "language_loss": 0.78559881, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.79619455, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.14318848, + "step": 4533, + "time_per_iteration": 2.733445644378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007563, + "balance_loss_mlp": 1.00112557, + "epoch": 0.8722585609849942, + "flos": 1495942318080.0, + "grad_norm": 0.005568796329920205, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.81974363, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.06445312, + "step": 4534, + "time_per_iteration": 4.842711925506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063185, + "balance_loss_mlp": 1.048558, + "epoch": 0.8724509426702578, + "flos": 596169123840.0, + "grad_norm": 0.06814746037567286, + "language_loss": 0.87232822, + "learning_rate": 4.206065974853479e-05, + "loss": 0.88296002, 
+ "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.14611816, + "step": 4535, + "time_per_iteration": 2.749300479888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010645, + "balance_loss_mlp": 1.04971766, + "epoch": 0.8726433243555214, + "flos": 443635481088.0, + "grad_norm": 0.0820490695559427, + "language_loss": 0.80679154, + "learning_rate": 4.193567838376888e-05, + "loss": 0.81743658, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.14758301, + "step": 4536, + "time_per_iteration": 2.553683042526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059539, + "balance_loss_mlp": 1.04492414, + "epoch": 0.8728357060407849, + "flos": 553181819904.0, + "grad_norm": 0.08604953628210836, + "language_loss": 0.81798059, + "learning_rate": 4.181087485534402e-05, + "loss": 0.82857597, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.14611816, + "step": 4537, + "time_per_iteration": 2.6546003818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063406, + "balance_loss_mlp": 1.04877949, + "epoch": 0.8730280877260485, + "flos": 627807946752.0, + "grad_norm": 0.08278290011227846, + "language_loss": 0.78786474, + "learning_rate": 4.16862492117136e-05, + "loss": 0.79849875, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.14611816, + "step": 4538, + "time_per_iteration": 2.8178372383117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060174, + "balance_loss_mlp": 1.04570246, + "epoch": 0.873220469411312, + "flos": 535384359936.0, + "grad_norm": 0.06689995736603449, + "language_loss": 0.8018595, + "learning_rate": 4.156180150126143e-05, + "loss": 0.8124612, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.14465332, + "step": 4539, + "time_per_iteration": 2.743286371231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069578, + "balance_loss_mlp": 1.05512953, + "epoch": 0.8734128510965756, + "flos": 
561883972608.0, + "grad_norm": 0.08737524822490801, + "language_loss": 0.8396098, + "learning_rate": 4.143753177230242e-05, + "loss": 0.85030556, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.14453125, + "step": 4540, + "time_per_iteration": 2.707806348800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061883, + "balance_loss_mlp": 1.04744649, + "epoch": 0.8736052327818392, + "flos": 686467643904.0, + "grad_norm": 0.06680973227686807, + "language_loss": 0.79487395, + "learning_rate": 4.131344007308224e-05, + "loss": 0.80549276, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.14416504, + "step": 4541, + "time_per_iteration": 2.9801111221313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060786, + "balance_loss_mlp": 1.04626584, + "epoch": 0.8737976144671027, + "flos": 531673113600.0, + "grad_norm": 0.07234482564699127, + "language_loss": 0.81535935, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.82596719, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.14501953, + "step": 4542, + "time_per_iteration": 2.8178179264068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062393, + "balance_loss_mlp": 1.047647, + "epoch": 0.8739899961523663, + "flos": 575592086016.0, + "grad_norm": 0.06822044709345593, + "language_loss": 0.81856036, + "learning_rate": 4.106579095649649e-05, + "loss": 0.82918429, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.14733887, + "step": 4543, + "time_per_iteration": 2.8611669540405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059548, + "balance_loss_mlp": 1.04505205, + "epoch": 0.8741823778376299, + "flos": 731332965888.0, + "grad_norm": 0.08490003911164679, + "language_loss": 0.76622522, + "learning_rate": 4.094223363527666e-05, + "loss": 0.77682072, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.14489746, + "step": 4544, + "time_per_iteration": 2.9649460315704346 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063692, + "balance_loss_mlp": 1.04885018, + "epoch": 0.8743747595228935, + "flos": 567080082432.0, + "grad_norm": 0.08047160087313358, + "language_loss": 0.83460504, + "learning_rate": 4.081885453608747e-05, + "loss": 0.84524196, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.14819336, + "step": 4545, + "time_per_iteration": 2.759756088256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064067, + "balance_loss_mlp": 1.04918993, + "epoch": 0.8745671412081569, + "flos": 493370569728.0, + "grad_norm": 0.06466903860964004, + "language_loss": 0.8239516, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83459222, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.14855957, + "step": 4546, + "time_per_iteration": 2.5922882556915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063052, + "balance_loss_mlp": 1.0481863, + "epoch": 0.8747595228934205, + "flos": 524139766272.0, + "grad_norm": 0.06777304384321896, + "language_loss": 0.83297229, + "learning_rate": 4.057263119533233e-05, + "loss": 0.84360284, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.14831543, + "step": 4547, + "time_per_iteration": 2.626225233078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062146, + "balance_loss_mlp": 1.0476141, + "epoch": 0.8749519045786841, + "flos": 744349118976.0, + "grad_norm": 0.07832002920068278, + "language_loss": 0.79854083, + "learning_rate": 4.044978704935853e-05, + "loss": 0.80916226, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.1451416, + "step": 4548, + "time_per_iteration": 3.0136497020721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064319, + "balance_loss_mlp": 1.04978716, + "epoch": 0.8751442862639477, + "flos": 594278995968.0, + "grad_norm": 0.0648484377907723, + "language_loss": 0.79846859, + "learning_rate": 4.032712131660027e-05, + "loss": 0.80911177, 
+ "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.14538574, + "step": 4549, + "time_per_iteration": 2.8334498405456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062635, + "balance_loss_mlp": 1.04817486, + "epoch": 0.8753366679492113, + "flos": 496530819072.0, + "grad_norm": 0.06635734878737051, + "language_loss": 0.7858516, + "learning_rate": 4.020463404468055e-05, + "loss": 0.79647791, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.14453125, + "step": 4550, + "time_per_iteration": 2.738966941833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106303, + "balance_loss_mlp": 1.0483439, + "epoch": 0.8755290496344748, + "flos": 489864526848.0, + "grad_norm": 0.0802538221579537, + "language_loss": 0.8152554, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.82588565, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.14685059, + "step": 4551, + "time_per_iteration": 2.56887149810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060736, + "balance_loss_mlp": 1.04596615, + "epoch": 0.8757214313197383, + "flos": 591859892736.0, + "grad_norm": 0.06834289789386311, + "language_loss": 0.81667864, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.82728601, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.14746094, + "step": 4552, + "time_per_iteration": 2.794102668762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062466, + "balance_loss_mlp": 1.04806566, + "epoch": 0.8759138130050019, + "flos": 976843763712.0, + "grad_norm": 0.07625032965138905, + "language_loss": 0.77863795, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.78926265, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.1439209, + "step": 4553, + "time_per_iteration": 3.2093098163604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063858, + "balance_loss_mlp": 1.04906428, + "epoch": 0.8761061946902655, + "flos": 
802764338688.0, + "grad_norm": 0.062390940172138094, + "language_loss": 0.77533054, + "learning_rate": 3.971647051542243e-05, + "loss": 0.78596914, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.14770508, + "step": 4554, + "time_per_iteration": 3.070384979248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106235, + "balance_loss_mlp": 1.04777074, + "epoch": 0.8762985763755291, + "flos": 698495602176.0, + "grad_norm": 0.06693574934874094, + "language_loss": 0.74468589, + "learning_rate": 3.95948762596155e-05, + "loss": 0.7553094, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.14562988, + "step": 4555, + "time_per_iteration": 2.9657835960388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061359, + "balance_loss_mlp": 1.04670799, + "epoch": 0.8764909580607926, + "flos": 629717898240.0, + "grad_norm": 0.07988560092503469, + "language_loss": 0.80092323, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.81153679, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.14648438, + "step": 4556, + "time_per_iteration": 2.8684329986572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064834, + "balance_loss_mlp": 1.050565, + "epoch": 0.8766833397460562, + "flos": 481545243648.0, + "grad_norm": 0.08423746847970588, + "language_loss": 0.80034041, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81098878, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.1427002, + "step": 4557, + "time_per_iteration": 2.7271201610565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067348, + "balance_loss_mlp": 1.05256641, + "epoch": 0.8768757214313198, + "flos": 407734414848.0, + "grad_norm": 0.07266214938945337, + "language_loss": 0.78330112, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79397452, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.14758301, + "step": 4558, + "time_per_iteration": 2.534062147140503 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064084, + "balance_loss_mlp": 1.04936194, + "epoch": 0.8770681031165833, + "flos": 582582150144.0, + "grad_norm": 0.07558368157454017, + "language_loss": 0.81913722, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.82977808, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.14697266, + "step": 4559, + "time_per_iteration": 2.6745707988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065429, + "balance_loss_mlp": 1.05061114, + "epoch": 0.8772604848018468, + "flos": 508687257600.0, + "grad_norm": 0.08298744774134015, + "language_loss": 0.80581164, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.81646591, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.14794922, + "step": 4560, + "time_per_iteration": 2.6083872318267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066798, + "balance_loss_mlp": 1.05203962, + "epoch": 0.8774528664871104, + "flos": 408836408832.0, + "grad_norm": 0.08583814592786851, + "language_loss": 0.85218108, + "learning_rate": 3.886906601970913e-05, + "loss": 0.86284906, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.14733887, + "step": 4561, + "time_per_iteration": 2.453648805618286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067415, + "balance_loss_mlp": 1.05254984, + "epoch": 0.877645248172374, + "flos": 500844819456.0, + "grad_norm": 0.06593803306167176, + "language_loss": 0.83422303, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.84489715, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.14855957, + "step": 4562, + "time_per_iteration": 2.6662542819976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063646, + "balance_loss_mlp": 1.04856586, + "epoch": 0.8778376298576376, + "flos": 633145019904.0, + "grad_norm": 0.07101645865230781, + "language_loss": 0.77801663, + "learning_rate": 3.862856098834189e-05, + "loss": 
0.78865308, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.1505127, + "step": 4563, + "time_per_iteration": 2.8906450271606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070539, + "balance_loss_mlp": 1.05560255, + "epoch": 0.8780300115429012, + "flos": 533988329472.0, + "grad_norm": 0.07397015685289171, + "language_loss": 0.8016603, + "learning_rate": 3.850857712974976e-05, + "loss": 0.81236565, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.14916992, + "step": 4564, + "time_per_iteration": 2.8398656845092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066243, + "balance_loss_mlp": 1.05191386, + "epoch": 0.8782223932281646, + "flos": 511662127104.0, + "grad_norm": 0.06215610141963286, + "language_loss": 0.77037019, + "learning_rate": 3.838877243801758e-05, + "loss": 0.78103256, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.14331055, + "step": 4565, + "time_per_iteration": 2.606433153152466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064056, + "balance_loss_mlp": 1.04955995, + "epoch": 0.8784147749134282, + "flos": 780714547200.0, + "grad_norm": 0.08789317923638273, + "language_loss": 0.69927686, + "learning_rate": 3.826914695965766e-05, + "loss": 0.70991743, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.14489746, + "step": 4566, + "time_per_iteration": 3.193535804748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067784, + "balance_loss_mlp": 1.05303764, + "epoch": 0.8786071565986918, + "flos": 561004434432.0, + "grad_norm": 0.10908406210790224, + "language_loss": 0.75545955, + "learning_rate": 3.814970074111279e-05, + "loss": 0.76613748, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.1472168, + "step": 4567, + "time_per_iteration": 2.7053375244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063765, + "balance_loss_mlp": 1.04924548, + "epoch": 0.8787995382839554, + 
"flos": 603448081920.0, + "grad_norm": 0.06509274087171016, + "language_loss": 0.77338004, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78401768, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.14501953, + "step": 4568, + "time_per_iteration": 2.823720693588257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067338, + "balance_loss_mlp": 1.05299711, + "epoch": 0.8789919199692189, + "flos": 560233552896.0, + "grad_norm": 0.06476749948002929, + "language_loss": 0.85155976, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.86223316, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.14355469, + "step": 4569, + "time_per_iteration": 2.69594407081604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063613, + "balance_loss_mlp": 1.04903364, + "epoch": 0.8791843016544825, + "flos": 539115429888.0, + "grad_norm": 0.09405373492006784, + "language_loss": 0.81978583, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.83042198, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.14575195, + "step": 4570, + "time_per_iteration": 2.7766315937042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066864, + "balance_loss_mlp": 1.05215406, + "epoch": 0.8793766833397461, + "flos": 1008699899904.0, + "grad_norm": 0.06533116595538893, + "language_loss": 0.79086006, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80152869, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.14709473, + "step": 4571, + "time_per_iteration": 3.391723871231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064404, + "balance_loss_mlp": 1.04999161, + "epoch": 0.8795690650250096, + "flos": 678637688832.0, + "grad_norm": 0.06515918314905815, + "language_loss": 0.81039464, + "learning_rate": 3.755516016623628e-05, + "loss": 0.82103866, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.14404297, + "step": 4572, + "time_per_iteration": 
2.877964496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065039, + "balance_loss_mlp": 1.05043602, + "epoch": 0.8797614467102732, + "flos": 453432287232.0, + "grad_norm": 0.07838900846740328, + "language_loss": 0.88639665, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.8970471, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.14575195, + "step": 4573, + "time_per_iteration": 2.562926769256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062634, + "balance_loss_mlp": 1.0480783, + "epoch": 0.8799538283955367, + "flos": 550913591808.0, + "grad_norm": 0.06634304029009142, + "language_loss": 0.84095144, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.85157776, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.14538574, + "step": 4574, + "time_per_iteration": 2.6689820289611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068241, + "balance_loss_mlp": 1.05397153, + "epoch": 0.8801462100808003, + "flos": 807429275136.0, + "grad_norm": 0.08408376547428717, + "language_loss": 0.84203458, + "learning_rate": 3.720058989624681e-05, + "loss": 0.85271698, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.1427002, + "step": 4575, + "time_per_iteration": 3.06958270072937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069659, + "balance_loss_mlp": 1.05469871, + "epoch": 0.8803385917660639, + "flos": 768694302720.0, + "grad_norm": 0.06560709355959533, + "language_loss": 0.84476829, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85546494, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.14941406, + "step": 4576, + "time_per_iteration": 2.9229040145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067971, + "balance_loss_mlp": 1.05327201, + "epoch": 0.8805309734513275, + "flos": 567339614208.0, + "grad_norm": 0.06356861295382751, + "language_loss": 0.81037927, + "learning_rate": 
3.696510801310632e-05, + "loss": 0.82105893, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.14685059, + "step": 4577, + "time_per_iteration": 2.735290765762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068874, + "balance_loss_mlp": 1.05421138, + "epoch": 0.880723355136591, + "flos": 679779330048.0, + "grad_norm": 0.07286316970096472, + "language_loss": 0.81711239, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82780111, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.1463623, + "step": 4578, + "time_per_iteration": 2.812211275100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065505, + "balance_loss_mlp": 1.05084252, + "epoch": 0.8809157368218545, + "flos": 565629723648.0, + "grad_norm": 0.06978735533151822, + "language_loss": 0.79132414, + "learning_rate": 3.673034519424734e-05, + "loss": 0.80197918, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.1463623, + "step": 4579, + "time_per_iteration": 2.7452139854431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067759, + "balance_loss_mlp": 1.05331051, + "epoch": 0.8811081185071181, + "flos": 515407878144.0, + "grad_norm": 0.07097224147621632, + "language_loss": 0.76073337, + "learning_rate": 3.661323354789586e-05, + "loss": 0.77141094, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.14440918, + "step": 4580, + "time_per_iteration": 2.6742916107177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066849, + "balance_loss_mlp": 1.05221033, + "epoch": 0.8813005001923817, + "flos": 594343236096.0, + "grad_norm": 0.11678051247214369, + "language_loss": 0.81309009, + "learning_rate": 3.649630180424191e-05, + "loss": 0.8237586, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.1463623, + "step": 4581, + "time_per_iteration": 2.676151752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106574, + "balance_loss_mlp": 1.05135107, + 
"epoch": 0.8814928818776453, + "flos": 666940843008.0, + "grad_norm": 0.07866838173150745, + "language_loss": 0.78949201, + "learning_rate": 3.637955000868254e-05, + "loss": 0.80014944, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.1439209, + "step": 4582, + "time_per_iteration": 2.841001510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064783, + "balance_loss_mlp": 1.0505619, + "epoch": 0.8816852635629088, + "flos": 609153343488.0, + "grad_norm": 0.08084171003417935, + "language_loss": 0.85922098, + "learning_rate": 3.626297820654467e-05, + "loss": 0.86986876, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.14221191, + "step": 4583, + "time_per_iteration": 2.817744016647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067996, + "balance_loss_mlp": 1.05326128, + "epoch": 0.8818776452481724, + "flos": 480379009536.0, + "grad_norm": 0.08737806044600016, + "language_loss": 0.81773436, + "learning_rate": 3.614658644308572e-05, + "loss": 0.82841432, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.14709473, + "step": 4584, + "time_per_iteration": 2.697969913482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073178, + "balance_loss_mlp": 1.05840755, + "epoch": 0.882070026933436, + "flos": 1045394242560.0, + "grad_norm": 0.07560542968481543, + "language_loss": 0.73435783, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74508959, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.14758301, + "step": 4585, + "time_per_iteration": 3.3223116397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062597, + "balance_loss_mlp": 1.04780316, + "epoch": 0.8822624086186995, + "flos": 474409446912.0, + "grad_norm": 0.06954120995359621, + "language_loss": 0.79935622, + "learning_rate": 3.591434321288345e-05, + "loss": 0.80998224, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.14770508, + "step": 4586, + 
"time_per_iteration": 2.6584787368774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063544, + "balance_loss_mlp": 1.04922748, + "epoch": 0.882454790303963, + "flos": 654023434752.0, + "grad_norm": 0.0731006388758823, + "language_loss": 0.81770998, + "learning_rate": 3.579849183630485e-05, + "loss": 0.82834542, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.14331055, + "step": 4587, + "time_per_iteration": 2.8163564205169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062533, + "balance_loss_mlp": 1.0481801, + "epoch": 0.8826471719892266, + "flos": 470325242880.0, + "grad_norm": 0.1045221274060957, + "language_loss": 0.78476524, + "learning_rate": 3.568282067873468e-05, + "loss": 0.79539055, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.14355469, + "step": 4588, + "time_per_iteration": 2.5708115100860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064335, + "balance_loss_mlp": 1.04931498, + "epoch": 0.8828395536744902, + "flos": 468753744384.0, + "grad_norm": 0.06849748948531013, + "language_loss": 0.83737075, + "learning_rate": 3.556732978508048e-05, + "loss": 0.84801412, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.15014648, + "step": 4589, + "time_per_iteration": 2.7350192070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066195, + "balance_loss_mlp": 1.05163944, + "epoch": 0.8830319353597538, + "flos": 721377944064.0, + "grad_norm": 0.09265144488683381, + "language_loss": 0.81130779, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82196975, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.14550781, + "step": 4590, + "time_per_iteration": 2.9506759643554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063865, + "balance_loss_mlp": 1.04927421, + "epoch": 0.8832243170450174, + "flos": 443277204480.0, + "grad_norm": 0.07536545400384899, + "language_loss": 0.8124311, + 
"learning_rate": 3.5336888968799996e-05, + "loss": 0.82306975, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.14599609, + "step": 4591, + "time_per_iteration": 2.568519353866577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066816, + "balance_loss_mlp": 1.05218911, + "epoch": 0.8834166987302808, + "flos": 566583413760.0, + "grad_norm": 0.07974593182180129, + "language_loss": 0.82008839, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.83075655, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.14611816, + "step": 4592, + "time_per_iteration": 2.74800968170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064472, + "balance_loss_mlp": 1.04994082, + "epoch": 0.8836090804155444, + "flos": 609316328448.0, + "grad_norm": 0.08282529824241759, + "language_loss": 0.81985712, + "learning_rate": 3.510716974532352e-05, + "loss": 0.83050191, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.14538574, + "step": 4593, + "time_per_iteration": 2.797036647796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062162, + "balance_loss_mlp": 1.04726076, + "epoch": 0.883801462100808, + "flos": 557065963008.0, + "grad_norm": 0.07056382399826802, + "language_loss": 0.8015058, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81212735, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.14880371, + "step": 4594, + "time_per_iteration": 2.7425427436828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062451, + "balance_loss_mlp": 1.04733491, + "epoch": 0.8839938437860716, + "flos": 516188671488.0, + "grad_norm": 0.07638624287700241, + "language_loss": 0.77165449, + "learning_rate": 3.487817247139064e-05, + "loss": 0.78227895, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.15100098, + "step": 4595, + "time_per_iteration": 2.6234378814697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058543, + 
"balance_loss_mlp": 1.04419065, + "epoch": 0.8841862254713351, + "flos": 713696292864.0, + "grad_norm": 0.06917902980564926, + "language_loss": 0.78930062, + "learning_rate": 3.47639446766777e-05, + "loss": 0.79988611, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.14343262, + "step": 4596, + "time_per_iteration": 2.9058618545532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010623, + "balance_loss_mlp": 1.04753017, + "epoch": 0.8843786071565987, + "flos": 833975875584.0, + "grad_norm": 0.0690866392470046, + "language_loss": 0.82326406, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.8338871, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.14746094, + "step": 4597, + "time_per_iteration": 3.100800037384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064515, + "balance_loss_mlp": 1.04985189, + "epoch": 0.8845709888418622, + "flos": 656884505088.0, + "grad_norm": 0.058588985333644296, + "language_loss": 0.82849264, + "learning_rate": 3.453603099349462e-05, + "loss": 0.83913779, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.14648438, + "step": 4598, + "time_per_iteration": 2.9068336486816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060442, + "balance_loss_mlp": 1.0461247, + "epoch": 0.8847633705271258, + "flos": 523326666240.0, + "grad_norm": 0.06896375109590577, + "language_loss": 0.80785215, + "learning_rate": 3.442234519350823e-05, + "loss": 0.81845653, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.14306641, + "step": 4599, + "time_per_iteration": 2.7556896209716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062174, + "balance_loss_mlp": 1.04742825, + "epoch": 0.8849557522123894, + "flos": 548591035392.0, + "grad_norm": 0.07253846816892973, + "language_loss": 0.84080333, + "learning_rate": 3.430884014679786e-05, + "loss": 0.85142505, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 
0.1472168, + "step": 4600, + "time_per_iteration": 2.676515579223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070121, + "balance_loss_mlp": 1.05523205, + "epoch": 0.8851481338976529, + "flos": 622372128768.0, + "grad_norm": 0.06699295131360646, + "language_loss": 0.83428752, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.84498876, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.14868164, + "step": 4601, + "time_per_iteration": 2.7971203327178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064622, + "balance_loss_mlp": 1.0499115, + "epoch": 0.8853405155829165, + "flos": 444359374848.0, + "grad_norm": 0.05944200349893636, + "language_loss": 0.80591571, + "learning_rate": 3.408237248940088e-05, + "loss": 0.81656194, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.14672852, + "step": 4602, + "time_per_iteration": 2.5471625328063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_mlp": 1.04959369, + "epoch": 0.8855328972681801, + "flos": 730470680064.0, + "grad_norm": 0.0684300317652771, + "language_loss": 0.78215384, + "learning_rate": 3.396940996663683e-05, + "loss": 0.79279757, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.14770508, + "step": 4603, + "time_per_iteration": 2.9694807529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061356, + "balance_loss_mlp": 1.04694319, + "epoch": 0.8857252789534437, + "flos": 487376414208.0, + "grad_norm": 0.06851899804046666, + "language_loss": 0.7892375, + "learning_rate": 3.385662837299375e-05, + "loss": 0.79985106, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.14404297, + "step": 4604, + "time_per_iteration": 2.5907418727874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064544, + "balance_loss_mlp": 1.0501318, + "epoch": 0.8859176606387072, + "flos": 508556206080.0, + "grad_norm": 0.07804226376806674, + 
"language_loss": 0.81699598, + "learning_rate": 3.374402775225727e-05, + "loss": 0.82764149, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.14404297, + "step": 4605, + "time_per_iteration": 2.71748685836792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066417, + "balance_loss_mlp": 1.05155182, + "epoch": 0.8861100423239707, + "flos": 516628440576.0, + "grad_norm": 0.07418647332566988, + "language_loss": 0.85657847, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.86724257, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.14831543, + "step": 4606, + "time_per_iteration": 2.668240547180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065286, + "balance_loss_mlp": 1.05090928, + "epoch": 0.8863024240092343, + "flos": 626975396352.0, + "grad_norm": 0.07368984647111583, + "language_loss": 0.79516572, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80581862, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.14367676, + "step": 4607, + "time_per_iteration": 2.7733378410339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062225, + "balance_loss_mlp": 1.04724002, + "epoch": 0.8864948056944979, + "flos": 766910260224.0, + "grad_norm": 0.06561105388045792, + "language_loss": 0.83195376, + "learning_rate": 3.340731216429083e-05, + "loss": 0.84257591, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.14953613, + "step": 4608, + "time_per_iteration": 2.9877283573150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100936, + "balance_loss_mlp": 1.00301838, + "epoch": 0.8866871873797615, + "flos": 1502331452928.0, + "grad_norm": 0.00781784977346765, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.7984032, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.06347656, + "step": 4609, + "time_per_iteration": 4.844639301300049 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01066515, + "balance_loss_mlp": 1.05162632, + "epoch": 0.886879569065025, + "flos": 811516050432.0, + "grad_norm": 0.08224338337652813, + "language_loss": 0.81893122, + "learning_rate": 3.3183740769755e-05, + "loss": 0.8295964, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.14868164, + "step": 4610, + "time_per_iteration": 3.0459225177764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008504, + "balance_loss_mlp": 1.00216174, + "epoch": 0.8870719507502886, + "flos": 1582838309376.0, + "grad_norm": 0.007754182988627756, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.7791934, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.06347656, + "step": 4611, + "time_per_iteration": 4.932186841964722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065487, + "balance_loss_mlp": 1.05087233, + "epoch": 0.8872643324355521, + "flos": 634027129344.0, + "grad_norm": 0.08360001369051026, + "language_loss": 0.7498275, + "learning_rate": 3.296089431172811e-05, + "loss": 0.76048243, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.14599609, + "step": 4612, + "time_per_iteration": 2.8011648654937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061275, + "balance_loss_mlp": 1.04660034, + "epoch": 0.8874567141208157, + "flos": 535755119616.0, + "grad_norm": 0.08952554638060775, + "language_loss": 0.83154523, + "learning_rate": 3.284974304209532e-05, + "loss": 0.84215796, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.14660645, + "step": 4613, + "time_per_iteration": 2.609548330307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062476, + "balance_loss_mlp": 1.04796779, + "epoch": 0.8876490958060793, + "flos": 1566302552064.0, + "grad_norm": 0.06343704124989273, + "language_loss": 0.79367721, + "learning_rate": 3.27387731362766e-05, + "loss": 0.80430192, + "num_input_tokens_seen": 
382091744, + "router_z_loss_mlp": 0.14489746, + "step": 4614, + "time_per_iteration": 3.8918566703796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063989, + "balance_loss_mlp": 1.04920697, + "epoch": 0.8878414774913428, + "flos": 636633810432.0, + "grad_norm": 0.06419240739581336, + "language_loss": 0.84827816, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.85891807, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.14758301, + "step": 4615, + "time_per_iteration": 2.793135643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064236, + "balance_loss_mlp": 1.0492754, + "epoch": 0.8880338591766064, + "flos": 496429502976.0, + "grad_norm": 0.09309272937962464, + "language_loss": 0.81416452, + "learning_rate": 3.251737758834084e-05, + "loss": 0.82480693, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.14953613, + "step": 4616, + "time_per_iteration": 2.5885441303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105746, + "balance_loss_mlp": 1.04276133, + "epoch": 0.88822624086187, + "flos": 542861180928.0, + "grad_norm": 0.06822196575636882, + "language_loss": 0.79946053, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.81003511, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.14672852, + "step": 4617, + "time_per_iteration": 2.6519088745117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058923, + "balance_loss_mlp": 1.04404545, + "epoch": 0.8884186225471336, + "flos": 551822865408.0, + "grad_norm": 0.09412059292181414, + "language_loss": 0.83855939, + "learning_rate": 3.229670801173418e-05, + "loss": 0.84914863, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.14855957, + "step": 4618, + "time_per_iteration": 2.6311991214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009378, + "balance_loss_mlp": 1.0030365, + "epoch": 0.888611004232397, + "flos": 1565263305216.0, + "grad_norm": 
0.009639459863935263, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79521573, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.06347656, + "step": 4619, + "time_per_iteration": 5.020185232162476 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064421, + "balance_loss_mlp": 1.05006814, + "epoch": 0.8888033859176606, + "flos": 767028828672.0, + "grad_norm": 0.06660971873423523, + "language_loss": 0.8234545, + "learning_rate": 3.207676474914301e-05, + "loss": 0.8340987, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.14343262, + "step": 4620, + "time_per_iteration": 3.0802297592163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058079, + "balance_loss_mlp": 1.04369044, + "epoch": 0.8889957676029242, + "flos": 934110849024.0, + "grad_norm": 0.07396102044579353, + "language_loss": 0.84266019, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.85324097, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.14379883, + "step": 4621, + "time_per_iteration": 3.201620578765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066988, + "balance_loss_mlp": 1.05226541, + "epoch": 0.8891881492881878, + "flos": 589611488256.0, + "grad_norm": 0.06887595273233507, + "language_loss": 0.81630599, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.82697588, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.14709473, + "step": 4622, + "time_per_iteration": 2.7901487350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064252, + "balance_loss_mlp": 1.04972029, + "epoch": 0.8893805309734514, + "flos": 540718861824.0, + "grad_norm": 0.08404775125115564, + "language_loss": 0.82411218, + "learning_rate": 3.174821244088466e-05, + "loss": 0.83475471, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.14526367, + "step": 4623, + "time_per_iteration": 2.7144739627838135 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01062011, + "balance_loss_mlp": 1.04719353, + "epoch": 0.8895729126587149, + "flos": 560095160832.0, + "grad_norm": 0.07528407764846204, + "language_loss": 0.81692713, + "learning_rate": 3.163905853111054e-05, + "loss": 0.82754725, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.14794922, + "step": 4624, + "time_per_iteration": 2.684248447418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068554, + "balance_loss_mlp": 1.05377233, + "epoch": 0.8897652943439784, + "flos": 610154021376.0, + "grad_norm": 0.07526595335560629, + "language_loss": 0.81158483, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82227045, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.14758301, + "step": 4625, + "time_per_iteration": 2.745210886001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060583, + "balance_loss_mlp": 1.04557419, + "epoch": 0.889957676029242, + "flos": 917847811584.0, + "grad_norm": 0.07027542614256606, + "language_loss": 0.77104485, + "learning_rate": 3.142129625539969e-05, + "loss": 0.78165066, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.15002441, + "step": 4626, + "time_per_iteration": 3.2061305046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067065, + "balance_loss_mlp": 1.05217612, + "epoch": 0.8901500577145056, + "flos": 488698292736.0, + "grad_norm": 0.0704878908918983, + "language_loss": 0.8078301, + "learning_rate": 3.131268797400588e-05, + "loss": 0.81850064, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.14855957, + "step": 4627, + "time_per_iteration": 2.607419013977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061017, + "balance_loss_mlp": 1.0462352, + "epoch": 0.8903424393997691, + "flos": 733648181760.0, + "grad_norm": 0.07540128325428244, + "language_loss": 0.80532998, + "learning_rate": 3.120426165316398e-05, + "loss": 0.81594014, + "num_input_tokens_seen": 
383314352, + "router_z_loss_mlp": 0.14770508, + "step": 4628, + "time_per_iteration": 3.0157666206359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060872, + "balance_loss_mlp": 1.04630482, + "epoch": 0.8905348210850327, + "flos": 519813282816.0, + "grad_norm": 0.06608891713828716, + "language_loss": 0.81858778, + "learning_rate": 3.109601733496881e-05, + "loss": 0.82919651, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.14562988, + "step": 4629, + "time_per_iteration": 2.6610121726989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063368, + "balance_loss_mlp": 1.04870582, + "epoch": 0.8907272027702963, + "flos": 578976989184.0, + "grad_norm": 0.06357905643630052, + "language_loss": 0.79617715, + "learning_rate": 3.098795506144458e-05, + "loss": 0.80681086, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.14648438, + "step": 4630, + "time_per_iteration": 2.818662405014038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061916, + "balance_loss_mlp": 1.04731333, + "epoch": 0.8909195844555599, + "flos": 893628910080.0, + "grad_norm": 0.08011777081386978, + "language_loss": 0.79218996, + "learning_rate": 3.088007487454475e-05, + "loss": 0.80280912, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.14599609, + "step": 4631, + "time_per_iteration": 3.111326217651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065393, + "balance_loss_mlp": 1.0505271, + "epoch": 0.8911119661408234, + "flos": 549865926144.0, + "grad_norm": 0.07451695723155916, + "language_loss": 0.84347403, + "learning_rate": 3.077237681615208e-05, + "loss": 0.854128, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.14855957, + "step": 4632, + "time_per_iteration": 2.654611349105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062727, + "balance_loss_mlp": 1.04776609, + "epoch": 0.8913043478260869, + "flos": 481139979264.0, + "grad_norm": 
0.1272094121243378, + "language_loss": 0.83604395, + "learning_rate": 3.066486092807874e-05, + "loss": 0.84667122, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.14941406, + "step": 4633, + "time_per_iteration": 2.789865732192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066836, + "balance_loss_mlp": 1.05250716, + "epoch": 0.8914967295113505, + "flos": 484581782016.0, + "grad_norm": 0.06541426651234629, + "language_loss": 0.85132289, + "learning_rate": 3.055752725206601e-05, + "loss": 0.86199123, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.14331055, + "step": 4634, + "time_per_iteration": 2.717449426651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062211, + "balance_loss_mlp": 1.04766774, + "epoch": 0.8916891111966141, + "flos": 445664001024.0, + "grad_norm": 0.0775971104699302, + "language_loss": 0.81119001, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82181215, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.14538574, + "step": 4635, + "time_per_iteration": 2.561291456222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062854, + "balance_loss_mlp": 1.04850113, + "epoch": 0.8918814928818777, + "flos": 564016379904.0, + "grad_norm": 0.06339714050321119, + "language_loss": 0.78017372, + "learning_rate": 3.034340670283453e-05, + "loss": 0.7908023, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.14343262, + "step": 4636, + "time_per_iteration": 2.745828151702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061938, + "balance_loss_mlp": 1.04759729, + "epoch": 0.8920738745671412, + "flos": 575943022080.0, + "grad_norm": 0.06775323964020447, + "language_loss": 0.81232381, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.82294321, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.14343262, + "step": 4637, + "time_per_iteration": 2.7148401737213135 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01058592, + "balance_loss_mlp": 1.04469275, + "epoch": 0.8922662562524047, + "flos": 620180623872.0, + "grad_norm": 0.07445586698610686, + "language_loss": 0.84255946, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.8531453, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.13916016, + "step": 4638, + "time_per_iteration": 2.7227747440338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065871, + "balance_loss_mlp": 1.05116081, + "epoch": 0.8924586379376683, + "flos": 583624673280.0, + "grad_norm": 0.06905475255164643, + "language_loss": 0.79193419, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80259293, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.14709473, + "step": 4639, + "time_per_iteration": 2.788235664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061668, + "balance_loss_mlp": 1.04706538, + "epoch": 0.8926510196229319, + "flos": 525177520128.0, + "grad_norm": 0.05686103058965172, + "language_loss": 0.81477505, + "learning_rate": 2.991735397786538e-05, + "loss": 0.82539171, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.14599609, + "step": 4640, + "time_per_iteration": 2.7929677963256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063545, + "balance_loss_mlp": 1.04906142, + "epoch": 0.8928434013081955, + "flos": 486669772800.0, + "grad_norm": 0.07327092814585671, + "language_loss": 0.8064788, + "learning_rate": 2.981129694909146e-05, + "loss": 0.81711423, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.14465332, + "step": 4641, + "time_per_iteration": 2.547323226928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008945, + "balance_loss_mlp": 1.00260293, + "epoch": 0.893035782993459, + "flos": 1448302560768.0, + "grad_norm": 0.006067492083133456, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.813398, + "num_input_tokens_seen": 
384472176, + "router_z_loss_mlp": 0.06347656, + "step": 4642, + "time_per_iteration": 4.66918683052063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067055, + "balance_loss_mlp": 1.0525589, + "epoch": 0.8932281646787226, + "flos": 611320255488.0, + "grad_norm": 0.08930531344509365, + "language_loss": 0.806961, + "learning_rate": 2.95997305629786e-05, + "loss": 0.8176316, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.14489746, + "step": 4643, + "time_per_iteration": 2.7685227394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069165, + "balance_loss_mlp": 1.05447841, + "epoch": 0.8934205463639862, + "flos": 565760775168.0, + "grad_norm": 0.07686935082063327, + "language_loss": 0.84716517, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.85785675, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.14660645, + "step": 4644, + "time_per_iteration": 2.671163320541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071165, + "balance_loss_mlp": 1.05693138, + "epoch": 0.8936129280492497, + "flos": 488431420416.0, + "grad_norm": 0.07237412632406065, + "language_loss": 0.77936012, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.79007179, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.14245605, + "step": 4645, + "time_per_iteration": 2.603137731552124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068007, + "balance_loss_mlp": 1.05332017, + "epoch": 0.8938053097345132, + "flos": 886490542080.0, + "grad_norm": 0.07522548933952772, + "language_loss": 0.80461609, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.81529617, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.14672852, + "step": 4646, + "time_per_iteration": 3.280094623565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066064, + "balance_loss_mlp": 1.05165184, + "epoch": 0.8939976914197768, + "flos": 593285658624.0, + "grad_norm": 
0.08219366473251573, + "language_loss": 0.83988786, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.85054851, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.14404297, + "step": 4647, + "time_per_iteration": 2.7825815677642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065827, + "balance_loss_mlp": 1.05093789, + "epoch": 0.8941900731050404, + "flos": 523247745024.0, + "grad_norm": 0.07496651199822538, + "language_loss": 0.81140471, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.82206297, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.14855957, + "step": 4648, + "time_per_iteration": 2.63409686088562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063595, + "balance_loss_mlp": 1.04921877, + "epoch": 0.894382454790304, + "flos": 800582745600.0, + "grad_norm": 0.06368180077002604, + "language_loss": 0.81087399, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.82150996, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.14367676, + "step": 4649, + "time_per_iteration": 3.0075058937072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065601, + "balance_loss_mlp": 1.05089021, + "epoch": 0.8945748364755676, + "flos": 479037307392.0, + "grad_norm": 0.075834985070335, + "language_loss": 0.84640402, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.85705996, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.14697266, + "step": 4650, + "time_per_iteration": 2.6034011840820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106995, + "balance_loss_mlp": 1.0558238, + "epoch": 0.894767218160831, + "flos": 508776090624.0, + "grad_norm": 0.07759752501155412, + "language_loss": 0.83263576, + "learning_rate": 2.876077330953042e-05, + "loss": 0.84333521, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.14123535, + "step": 4651, + "time_per_iteration": 2.705700635910034 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01062492, + "balance_loss_mlp": 1.04778171, + "epoch": 0.8949595998460946, + "flos": 685857549312.0, + "grad_norm": 0.07437002469966474, + "language_loss": 0.81665766, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.82728255, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.14685059, + "step": 4652, + "time_per_iteration": 2.8570845127105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063896, + "balance_loss_mlp": 1.04962647, + "epoch": 0.8951519815313582, + "flos": 799920520704.0, + "grad_norm": 0.0793410947413692, + "language_loss": 0.7713061, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78194505, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.1427002, + "step": 4653, + "time_per_iteration": 3.003610610961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066344, + "balance_loss_mlp": 1.05177677, + "epoch": 0.8953443632166218, + "flos": 666740782080.0, + "grad_norm": 0.07417541500782357, + "language_loss": 0.86446422, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.87512767, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.14550781, + "step": 4654, + "time_per_iteration": 2.797839403152466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074989, + "balance_loss_mlp": 1.0601716, + "epoch": 0.8955367449018854, + "flos": 644977686528.0, + "grad_norm": 0.07248317296811377, + "language_loss": 0.83017957, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84092951, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.14819336, + "step": 4655, + "time_per_iteration": 2.8501458168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069, + "balance_loss_mlp": 1.05457556, + "epoch": 0.8957291265871489, + "flos": 808714077696.0, + "grad_norm": 0.08493016666071657, + "language_loss": 0.77622211, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.78691208, + "num_input_tokens_seen": 
385509840, + "router_z_loss_mlp": 0.14416504, + "step": 4656, + "time_per_iteration": 3.0544345378875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072691, + "balance_loss_mlp": 1.05780149, + "epoch": 0.8959215082724125, + "flos": 518923832832.0, + "grad_norm": 0.06895909454077155, + "language_loss": 0.77396119, + "learning_rate": 2.813923817903391e-05, + "loss": 0.78468812, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.14880371, + "step": 4657, + "time_per_iteration": 2.625452995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062071, + "balance_loss_mlp": 1.04775393, + "epoch": 0.896113889957676, + "flos": 476917383168.0, + "grad_norm": 0.0682470588082175, + "language_loss": 0.770661, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78128171, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.14318848, + "step": 4658, + "time_per_iteration": 2.644498348236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068233, + "balance_loss_mlp": 1.05376089, + "epoch": 0.8963062716429396, + "flos": 518162863104.0, + "grad_norm": 0.08115976152175236, + "language_loss": 0.83185726, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84253961, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.14453125, + "step": 4659, + "time_per_iteration": 2.6564247608184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072802, + "balance_loss_mlp": 1.05815101, + "epoch": 0.8964986533282031, + "flos": 508484625408.0, + "grad_norm": 0.07390456564378446, + "language_loss": 0.81757265, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.82830071, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.1463623, + "step": 4660, + "time_per_iteration": 2.782780170440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068951, + "balance_loss_mlp": 1.05434823, + "epoch": 0.8966910350134667, + "flos": 536076320256.0, + "grad_norm": 
0.08706141368322202, + "language_loss": 0.81532872, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82601821, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.14575195, + "step": 4661, + "time_per_iteration": 2.6224963665008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069438, + "balance_loss_mlp": 1.05518055, + "epoch": 0.8968834166987303, + "flos": 723226226688.0, + "grad_norm": 0.08580676839131798, + "language_loss": 0.84018874, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.85088313, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.14257812, + "step": 4662, + "time_per_iteration": 2.8570454120635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064868, + "balance_loss_mlp": 1.05049145, + "epoch": 0.8970757983839939, + "flos": 681686710272.0, + "grad_norm": 0.060610777722161696, + "language_loss": 0.83853519, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.84918392, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.14355469, + "step": 4663, + "time_per_iteration": 2.920295476913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063037, + "balance_loss_mlp": 1.04879189, + "epoch": 0.8972681800692575, + "flos": 613037486592.0, + "grad_norm": 0.0958798817779249, + "language_loss": 0.75815916, + "learning_rate": 2.742244971856006e-05, + "loss": 0.76878953, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.14257812, + "step": 4664, + "time_per_iteration": 2.734210729598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067954, + "balance_loss_mlp": 1.05330276, + "epoch": 0.8974605617545209, + "flos": 572350344192.0, + "grad_norm": 0.06671630695450127, + "language_loss": 0.83153635, + "learning_rate": 2.732078493352913e-05, + "loss": 0.8422159, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.14624023, + "step": 4665, + "time_per_iteration": 2.785287857055664 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01067553, + "balance_loss_mlp": 1.05297387, + "epoch": 0.8976529434397845, + "flos": 520418608128.0, + "grad_norm": 0.07070738128757356, + "language_loss": 0.87607473, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88675022, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.14575195, + "step": 4666, + "time_per_iteration": 2.703206777572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069921, + "balance_loss_mlp": 1.0556159, + "epoch": 0.8978453251250481, + "flos": 471355656192.0, + "grad_norm": 0.06387672744087973, + "language_loss": 0.82552743, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.83622664, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.14282227, + "step": 4667, + "time_per_iteration": 2.6351258754730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065876, + "balance_loss_mlp": 1.05146372, + "epoch": 0.8980377068103117, + "flos": 591659831808.0, + "grad_norm": 0.05668942470772938, + "language_loss": 0.81973124, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.83038998, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.14416504, + "step": 4668, + "time_per_iteration": 2.787976026535034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069796, + "balance_loss_mlp": 1.05552649, + "epoch": 0.8982300884955752, + "flos": 767619472896.0, + "grad_norm": 0.08326371614499664, + "language_loss": 0.82582569, + "learning_rate": 2.691596129049556e-05, + "loss": 0.83652365, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.1427002, + "step": 4669, + "time_per_iteration": 2.969316244125366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067174, + "balance_loss_mlp": 1.05295277, + "epoch": 0.8984224701808388, + "flos": 844575496704.0, + "grad_norm": 0.06732596240846979, + "language_loss": 0.77453232, + "learning_rate": 2.681521445046775e-05, + "loss": 0.78520411, + "num_input_tokens_seen": 
386532384, + "router_z_loss_mlp": 0.14208984, + "step": 4670, + "time_per_iteration": 3.223684549331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065272, + "balance_loss_mlp": 1.0510509, + "epoch": 0.8986148518661023, + "flos": 757661879808.0, + "grad_norm": 0.07787101362548385, + "language_loss": 0.75908744, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.76974022, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.14221191, + "step": 4671, + "time_per_iteration": 3.1525230407714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063641, + "balance_loss_mlp": 1.04913378, + "epoch": 0.8988072335513659, + "flos": 563070030336.0, + "grad_norm": 0.07908163897949186, + "language_loss": 0.76794827, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.77858472, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.14489746, + "step": 4672, + "time_per_iteration": 2.696798801422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069201, + "balance_loss_mlp": 1.05497932, + "epoch": 0.8989996152366295, + "flos": 492683751936.0, + "grad_norm": 0.0696282834458535, + "language_loss": 0.86689526, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.87758726, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.14208984, + "step": 4673, + "time_per_iteration": 2.608199119567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069189, + "balance_loss_mlp": 1.05446625, + "epoch": 0.899191996921893, + "flos": 542567144448.0, + "grad_norm": 0.0815899420947763, + "language_loss": 0.75713086, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.7678228, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.14697266, + "step": 4674, + "time_per_iteration": 2.6534104347229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063878, + "balance_loss_mlp": 1.04948986, + "epoch": 0.8993843786071566, + "flos": 471325920768.0, + "grad_norm": 
0.09288990193845198, + "language_loss": 0.79658222, + "learning_rate": 2.631423662948984e-05, + "loss": 0.80722106, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.1439209, + "step": 4675, + "time_per_iteration": 2.5522913932800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066799, + "balance_loss_mlp": 1.05254185, + "epoch": 0.8995767602924202, + "flos": 526726623744.0, + "grad_norm": 0.07376998741861143, + "language_loss": 0.82278091, + "learning_rate": 2.621459261342196e-05, + "loss": 0.83344889, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.14245605, + "step": 4676, + "time_per_iteration": 2.744189739227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067895, + "balance_loss_mlp": 1.05380487, + "epoch": 0.8997691419776838, + "flos": 557634212352.0, + "grad_norm": 0.07732253278752255, + "language_loss": 0.84530777, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.85598671, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.14099121, + "step": 4677, + "time_per_iteration": 2.687650203704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068556, + "balance_loss_mlp": 1.05438173, + "epoch": 0.8999615236629472, + "flos": 639027947520.0, + "grad_norm": 0.07249769601440123, + "language_loss": 0.80352259, + "learning_rate": 2.601585643932436e-05, + "loss": 0.81420815, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.1418457, + "step": 4678, + "time_per_iteration": 2.8250062465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010174, + "balance_loss_mlp": 1.00383258, + "epoch": 0.9001539053482108, + "flos": 1431510547968.0, + "grad_norm": 0.0048517691519101465, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86794198, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.06347656, + "step": 4679, + "time_per_iteration": 4.8175883293151855 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01069083, + "balance_loss_mlp": 1.05459857, + "epoch": 0.9003462870334744, + "flos": 566877450240.0, + "grad_norm": 0.07910377737039828, + "language_loss": 0.79715955, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.80785036, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.14501953, + "step": 4680, + "time_per_iteration": 2.88101863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066488, + "balance_loss_mlp": 1.05164611, + "epoch": 0.900538668718738, + "flos": 538655837184.0, + "grad_norm": 0.09077574273759434, + "language_loss": 0.784284, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.79494882, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.14819336, + "step": 4681, + "time_per_iteration": 2.6530046463012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065401, + "balance_loss_mlp": 1.05086935, + "epoch": 0.9007310504040016, + "flos": 488387003904.0, + "grad_norm": 0.09481128495773143, + "language_loss": 0.85749257, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.86814654, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.14526367, + "step": 4682, + "time_per_iteration": 2.5924911499023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069464, + "balance_loss_mlp": 1.05537307, + "epoch": 0.9009234320892651, + "flos": 652901617152.0, + "grad_norm": 0.07242356929765614, + "language_loss": 0.78751016, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.79820478, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.14086914, + "step": 4683, + "time_per_iteration": 2.8851335048675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069143, + "balance_loss_mlp": 1.05439687, + "epoch": 0.9011158137745287, + "flos": 545569178112.0, + "grad_norm": 0.05948760024031309, + "language_loss": 0.85193956, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.86263096, + 
"num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.1472168, + "step": 4684, + "time_per_iteration": 2.6724307537078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060384, + "balance_loss_mlp": 1.04620993, + "epoch": 0.9013081954597922, + "flos": 559699808256.0, + "grad_norm": 0.06866797250942824, + "language_loss": 0.82781953, + "learning_rate": 2.532607837883011e-05, + "loss": 0.83842337, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.14160156, + "step": 4685, + "time_per_iteration": 2.739715099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065935, + "balance_loss_mlp": 1.05133152, + "epoch": 0.9015005771450558, + "flos": 728652132864.0, + "grad_norm": 0.06366441246454302, + "language_loss": 0.81115925, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.82181865, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.14599609, + "step": 4686, + "time_per_iteration": 2.9184703826904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065827, + "balance_loss_mlp": 1.05153358, + "epoch": 0.9016929588303193, + "flos": 517416574464.0, + "grad_norm": 0.07207113969320707, + "language_loss": 0.81100535, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.82166362, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.14294434, + "step": 4687, + "time_per_iteration": 2.788123369216919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065738, + "balance_loss_mlp": 1.05131364, + "epoch": 0.9018853405155829, + "flos": 622335052800.0, + "grad_norm": 0.08582414251265837, + "language_loss": 0.85935158, + "learning_rate": 2.503322271810171e-05, + "loss": 0.87000895, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.14416504, + "step": 4688, + "time_per_iteration": 2.828031301498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065081, + "balance_loss_mlp": 1.05029953, + "epoch": 0.9020777222008465, + "flos": 
523284820992.0, + "grad_norm": 0.07001634652504764, + "language_loss": 0.77557302, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.78622389, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.14758301, + "step": 4689, + "time_per_iteration": 2.628683567047119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061017, + "balance_loss_mlp": 1.04687846, + "epoch": 0.9022701038861101, + "flos": 633713269248.0, + "grad_norm": 0.058017175212302395, + "language_loss": 0.81781268, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.82842284, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.14135742, + "step": 4690, + "time_per_iteration": 2.9031572341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067164, + "balance_loss_mlp": 1.05271626, + "epoch": 0.9024624855713737, + "flos": 513295294464.0, + "grad_norm": 0.08913945678563855, + "language_loss": 0.84359717, + "learning_rate": 2.474202664305253e-05, + "loss": 0.85426879, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.14428711, + "step": 4691, + "time_per_iteration": 2.631625175476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065723, + "balance_loss_mlp": 1.05104828, + "epoch": 0.9026548672566371, + "flos": 477411480576.0, + "grad_norm": 0.07129062620509946, + "language_loss": 0.86470556, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87536281, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.14660645, + "step": 4692, + "time_per_iteration": 2.6019630432128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106471, + "balance_loss_mlp": 1.05016685, + "epoch": 0.9028472489419007, + "flos": 661994353152.0, + "grad_norm": 0.06751433546030572, + "language_loss": 0.73846859, + "learning_rate": 2.454881842109058e-05, + "loss": 0.74911571, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.14526367, + "step": 4693, + "time_per_iteration": 2.8628439903259277 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070243, + "balance_loss_mlp": 1.05603313, + "epoch": 0.9030396306271643, + "flos": 534588885504.0, + "grad_norm": 0.08037251387372839, + "language_loss": 0.8173911, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.82809353, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.14208984, + "step": 4694, + "time_per_iteration": 2.6520001888275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071097, + "balance_loss_mlp": 1.05644655, + "epoch": 0.9032320123124279, + "flos": 801032426496.0, + "grad_norm": 0.06679419876391568, + "language_loss": 0.82209843, + "learning_rate": 2.43563485451328e-05, + "loss": 0.83280945, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.1463623, + "step": 4695, + "time_per_iteration": 2.9608535766601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064857, + "balance_loss_mlp": 1.05057585, + "epoch": 0.9034243939976914, + "flos": 553942789632.0, + "grad_norm": 0.07597748823489225, + "language_loss": 0.76713479, + "learning_rate": 2.426039058035451e-05, + "loss": 0.77778327, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.14294434, + "step": 4696, + "time_per_iteration": 2.6596148014068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065816, + "balance_loss_mlp": 1.05115342, + "epoch": 0.903616775682955, + "flos": 503903752704.0, + "grad_norm": 0.08113286673515856, + "language_loss": 0.82694674, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.83760482, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.1463623, + "step": 4697, + "time_per_iteration": 2.583207845687866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066346, + "balance_loss_mlp": 1.05205238, + "epoch": 0.9038091573682185, + "flos": 436297052160.0, + "grad_norm": 0.06372833149732707, + "language_loss": 0.78702718, + "learning_rate": 2.406902878347017e-05, + "loss": 
0.79769063, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.14294434, + "step": 4698, + "time_per_iteration": 2.6969242095947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068087, + "balance_loss_mlp": 1.0532459, + "epoch": 0.9040015390534821, + "flos": 532916070912.0, + "grad_norm": 0.10371766305830984, + "language_loss": 0.81603229, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.8267132, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.14807129, + "step": 4699, + "time_per_iteration": 2.715841054916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064575, + "balance_loss_mlp": 1.05056787, + "epoch": 0.9041939207387457, + "flos": 564307845120.0, + "grad_norm": 0.07382898909247483, + "language_loss": 0.80341852, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.81406426, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.14025879, + "step": 4700, + "time_per_iteration": 2.777735948562622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066468, + "balance_loss_mlp": 1.05236554, + "epoch": 0.9043863024240092, + "flos": 515509194240.0, + "grad_norm": 0.08219031105158194, + "language_loss": 0.77413332, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.78479803, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.14099121, + "step": 4701, + "time_per_iteration": 2.5898244380950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006479, + "balance_loss_mlp": 1.00013745, + "epoch": 0.9045786841092728, + "flos": 1277949063168.0, + "grad_norm": 0.004346431205568835, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73936266, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.06347656, + "step": 4702, + "time_per_iteration": 4.976499557495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069366, + "balance_loss_mlp": 1.05514455, + "epoch": 0.9047710657945364, 
+ "flos": 585841144320.0, + "grad_norm": 0.09422512438722834, + "language_loss": 0.82765079, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.83834445, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.14221191, + "step": 4703, + "time_per_iteration": 2.694584369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059188, + "balance_loss_mlp": 1.04433465, + "epoch": 0.9049634474798, + "flos": 571937739264.0, + "grad_norm": 0.07632132542953685, + "language_loss": 0.79691899, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80751085, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.14831543, + "step": 4704, + "time_per_iteration": 2.7374324798583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065528, + "balance_loss_mlp": 1.05164027, + "epoch": 0.9051558291650635, + "flos": 572619787776.0, + "grad_norm": 0.09869454063835598, + "language_loss": 0.74012047, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75077575, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.13916016, + "step": 4705, + "time_per_iteration": 2.6923489570617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061143, + "balance_loss_mlp": 1.04684973, + "epoch": 0.905348210850327, + "flos": 540538624512.0, + "grad_norm": 0.07680020480338727, + "language_loss": 0.7908138, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80142522, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.14282227, + "step": 4706, + "time_per_iteration": 2.7089977264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064406, + "balance_loss_mlp": 1.04993391, + "epoch": 0.9055405925355906, + "flos": 516381391872.0, + "grad_norm": 0.06962007853797926, + "language_loss": 0.81341648, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82406056, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.14453125, + "step": 4707, + "time_per_iteration": 
2.5984983444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060615, + "balance_loss_mlp": 1.04610777, + "epoch": 0.9057329742208542, + "flos": 914643145728.0, + "grad_norm": 0.07927977683650914, + "language_loss": 0.84686792, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.85747409, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.14489746, + "step": 4708, + "time_per_iteration": 3.2090601921081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061343, + "balance_loss_mlp": 1.04700196, + "epoch": 0.9059253559061178, + "flos": 905261515776.0, + "grad_norm": 0.07156146868519836, + "language_loss": 0.82681596, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.8374294, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.14343262, + "step": 4709, + "time_per_iteration": 3.144296884536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106027, + "balance_loss_mlp": 1.0454762, + "epoch": 0.9061177375913813, + "flos": 664534222848.0, + "grad_norm": 0.08456755450769048, + "language_loss": 0.77485931, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.78546202, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.14770508, + "step": 4710, + "time_per_iteration": 2.905553102493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064119, + "balance_loss_mlp": 1.04987335, + "epoch": 0.9063101192766448, + "flos": 565609900032.0, + "grad_norm": 0.0740076583450613, + "language_loss": 0.82648456, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.83712578, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.14233398, + "step": 4711, + "time_per_iteration": 2.807823419570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065313, + "balance_loss_mlp": 1.05088818, + "epoch": 0.9065025009619084, + "flos": 727377242112.0, + "grad_norm": 0.07095500667918284, + "language_loss": 0.79161823, + "learning_rate": 
2.2750207252737742e-05, + "loss": 0.80227137, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.14404297, + "step": 4712, + "time_per_iteration": 2.9008591175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059623, + "balance_loss_mlp": 1.0455091, + "epoch": 0.906694882647172, + "flos": 531512699904.0, + "grad_norm": 0.07208403118475668, + "language_loss": 0.79940331, + "learning_rate": 2.265739417041418e-05, + "loss": 0.80999959, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.14111328, + "step": 4713, + "time_per_iteration": 2.630858898162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063209, + "balance_loss_mlp": 1.04830837, + "epoch": 0.9068872643324356, + "flos": 429788975616.0, + "grad_norm": 0.08721632216250842, + "language_loss": 0.846187, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.85681909, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.14892578, + "step": 4714, + "time_per_iteration": 2.588636636734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069052, + "balance_loss_mlp": 1.05467498, + "epoch": 0.9070796460176991, + "flos": 588366332928.0, + "grad_norm": 0.07267525765549768, + "language_loss": 0.79764116, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.80833167, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.14367676, + "step": 4715, + "time_per_iteration": 2.753297805786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065162, + "balance_loss_mlp": 1.05052352, + "epoch": 0.9072720277029627, + "flos": 571582033920.0, + "grad_norm": 0.13233768252009984, + "language_loss": 0.75550985, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.76616144, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.14611816, + "step": 4716, + "time_per_iteration": 2.7039754390716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062213, + "balance_loss_mlp": 1.04836059, 
+ "epoch": 0.9074644093882263, + "flos": 555798412800.0, + "grad_norm": 0.0678711634408688, + "language_loss": 0.88607067, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89669281, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.13867188, + "step": 4717, + "time_per_iteration": 2.653090476989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106287, + "balance_loss_mlp": 1.04805207, + "epoch": 0.9076567910734898, + "flos": 640994798592.0, + "grad_norm": 0.0708629582790745, + "language_loss": 0.82607627, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.83670497, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.14794922, + "step": 4718, + "time_per_iteration": 2.7904906272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067779, + "balance_loss_mlp": 1.05324757, + "epoch": 0.9078491727587533, + "flos": 733998744576.0, + "grad_norm": 0.06597864576502437, + "language_loss": 0.8173753, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.82805312, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.1451416, + "step": 4719, + "time_per_iteration": 3.107698678970337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062712, + "balance_loss_mlp": 1.04819226, + "epoch": 0.9080415544440169, + "flos": 654774492672.0, + "grad_norm": 0.06124132734061613, + "language_loss": 0.86642504, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.87705219, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.14489746, + "step": 4720, + "time_per_iteration": 2.853066921234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061154, + "balance_loss_mlp": 1.0462532, + "epoch": 0.9082339361292805, + "flos": 597463838208.0, + "grad_norm": 0.06958086919390859, + "language_loss": 0.79430765, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80491918, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.14892578, + "step": 4721, + 
"time_per_iteration": 2.7562382221221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064575, + "balance_loss_mlp": 1.05006683, + "epoch": 0.9084263178145441, + "flos": 504407761920.0, + "grad_norm": 0.07783183488641256, + "language_loss": 0.84554178, + "learning_rate": 2.183042016731457e-05, + "loss": 0.85618752, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.14489746, + "step": 4722, + "time_per_iteration": 2.6053574085235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063826, + "balance_loss_mlp": 1.04959297, + "epoch": 0.9086186994998077, + "flos": 550031482368.0, + "grad_norm": 0.0692661418085719, + "language_loss": 0.80308425, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.81372249, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.14221191, + "step": 4723, + "time_per_iteration": 2.726078748703003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066428, + "balance_loss_mlp": 1.05238521, + "epoch": 0.9088110811850711, + "flos": 1134076847616.0, + "grad_norm": 0.07360851759863624, + "language_loss": 0.75023186, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76089615, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.14025879, + "step": 4724, + "time_per_iteration": 3.610614776611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066842, + "balance_loss_mlp": 1.05257237, + "epoch": 0.9090034628703347, + "flos": 556991811072.0, + "grad_norm": 0.06736800073919637, + "language_loss": 0.76777983, + "learning_rate": 2.155810244111628e-05, + "loss": 0.77844834, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.1427002, + "step": 4725, + "time_per_iteration": 2.7194221019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066103, + "balance_loss_mlp": 1.05192947, + "epoch": 0.9091958445555983, + "flos": 543970515456.0, + "grad_norm": 0.06425275032512914, + "language_loss": 0.84323931, + 
"learning_rate": 2.146770131403658e-05, + "loss": 0.85390031, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.1418457, + "step": 4726, + "time_per_iteration": 2.70613169670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068032, + "balance_loss_mlp": 1.0533216, + "epoch": 0.9093882262408619, + "flos": 526113957888.0, + "grad_norm": 0.0798223233721421, + "language_loss": 0.80948919, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82016957, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.14697266, + "step": 4727, + "time_per_iteration": 2.6258621215820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067441, + "balance_loss_mlp": 1.05299282, + "epoch": 0.9095806079261254, + "flos": 548526795264.0, + "grad_norm": 0.0886839073998551, + "language_loss": 0.819561, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.83023536, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.14453125, + "step": 4728, + "time_per_iteration": 2.6196727752685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065639, + "balance_loss_mlp": 1.05156028, + "epoch": 0.909772989611389, + "flos": 572535724032.0, + "grad_norm": 0.0754217444502015, + "language_loss": 0.8483696, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85902596, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.14086914, + "step": 4729, + "time_per_iteration": 2.736675977706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064015, + "balance_loss_mlp": 1.04972172, + "epoch": 0.9099653712966526, + "flos": 561812391936.0, + "grad_norm": 0.07023842602850215, + "language_loss": 0.79529822, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.80593836, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.14294434, + "step": 4730, + "time_per_iteration": 2.682899236679077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062771, + "balance_loss_mlp": 
1.04809618, + "epoch": 0.9101577529819161, + "flos": 1093800112128.0, + "grad_norm": 0.08043552719518131, + "language_loss": 0.79924774, + "learning_rate": 2.101848311877069e-05, + "loss": 0.80987543, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.14672852, + "step": 4731, + "time_per_iteration": 3.3790597915649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065728, + "balance_loss_mlp": 1.05133891, + "epoch": 0.9103501346671797, + "flos": 445444116480.0, + "grad_norm": 0.11616845268316883, + "language_loss": 0.81709516, + "learning_rate": 2.092919721190678e-05, + "loss": 0.82775241, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.14367676, + "step": 4732, + "time_per_iteration": 2.5050580501556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069513, + "balance_loss_mlp": 1.0552671, + "epoch": 0.9105425163524432, + "flos": 500770667520.0, + "grad_norm": 0.08258993648041235, + "language_loss": 0.77471602, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.78541112, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.14257812, + "step": 4733, + "time_per_iteration": 2.619145393371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061811, + "balance_loss_mlp": 1.04723191, + "epoch": 0.9107348980377068, + "flos": 657519565824.0, + "grad_norm": 0.06290926072647557, + "language_loss": 0.84182942, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.85244751, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.14562988, + "step": 4734, + "time_per_iteration": 2.8620665073394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067407, + "balance_loss_mlp": 1.05304253, + "epoch": 0.9109272797229704, + "flos": 553668576768.0, + "grad_norm": 0.06328957974022432, + "language_loss": 0.85179257, + "learning_rate": 2.066245558029256e-05, + "loss": 0.86246669, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.14367676, + "step": 
4735, + "time_per_iteration": 2.6373870372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069052, + "balance_loss_mlp": 1.05477083, + "epoch": 0.911119661408234, + "flos": 519007896576.0, + "grad_norm": 0.08501781377109913, + "language_loss": 0.84289479, + "learning_rate": 2.057391384781182e-05, + "loss": 0.85358536, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.14282227, + "step": 4736, + "time_per_iteration": 2.6207847595214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066578, + "balance_loss_mlp": 1.05243933, + "epoch": 0.9113120430934974, + "flos": 554375218176.0, + "grad_norm": 0.07334243332410934, + "language_loss": 0.82774675, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.83841252, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.14135742, + "step": 4737, + "time_per_iteration": 2.735215663909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068558, + "balance_loss_mlp": 1.05442023, + "epoch": 0.911504424778761, + "flos": 501889913856.0, + "grad_norm": 0.08209276637430535, + "language_loss": 0.81173909, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.82242465, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.14135742, + "step": 4738, + "time_per_iteration": 2.7108118534088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065634, + "balance_loss_mlp": 1.05147171, + "epoch": 0.9116968064640246, + "flos": 611100370944.0, + "grad_norm": 0.06232841540702404, + "language_loss": 0.82146698, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.83212328, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.14172363, + "step": 4739, + "time_per_iteration": 2.7313618659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065756, + "balance_loss_mlp": 1.0513792, + "epoch": 0.9118891881492882, + "flos": 572918593536.0, + "grad_norm": 0.0711794304309251, + "language_loss": 0.82400596, + 
"learning_rate": 2.0221608557691895e-05, + "loss": 0.83466357, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.14379883, + "step": 4740, + "time_per_iteration": 2.7886438369750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068105, + "balance_loss_mlp": 1.05344248, + "epoch": 0.9120815698345518, + "flos": 635961673728.0, + "grad_norm": 0.07367219932429818, + "language_loss": 0.77975518, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.79043615, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.14648438, + "step": 4741, + "time_per_iteration": 2.8221640586853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064444, + "balance_loss_mlp": 1.04966211, + "epoch": 0.9122739515198153, + "flos": 702300824064.0, + "grad_norm": 0.09047893023933404, + "language_loss": 0.85824144, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.86888587, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.14758301, + "step": 4742, + "time_per_iteration": 2.846888303756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066977, + "balance_loss_mlp": 1.0525769, + "epoch": 0.9124663332050789, + "flos": 524690763264.0, + "grad_norm": 0.08214101910930026, + "language_loss": 0.87773347, + "learning_rate": 1.995933526832239e-05, + "loss": 0.8884033, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.14404297, + "step": 4743, + "time_per_iteration": 2.61386775970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106707, + "balance_loss_mlp": 1.05287266, + "epoch": 0.9126587148903424, + "flos": 563299826688.0, + "grad_norm": 0.07495266028674485, + "language_loss": 0.82313836, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.83380902, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.14196777, + "step": 4744, + "time_per_iteration": 2.675384521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063023, + 
"balance_loss_mlp": 1.04872966, + "epoch": 0.912851096575606, + "flos": 505942184448.0, + "grad_norm": 0.08022890844288642, + "language_loss": 0.79648215, + "learning_rate": 1.978541819374574e-05, + "loss": 0.8071124, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.14294434, + "step": 4745, + "time_per_iteration": 2.599677562713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065707, + "balance_loss_mlp": 1.05115116, + "epoch": 0.9130434782608695, + "flos": 550730783232.0, + "grad_norm": 0.06424821919191918, + "language_loss": 0.82602614, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.83668321, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.14550781, + "step": 4746, + "time_per_iteration": 2.6465249061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069413, + "balance_loss_mlp": 1.05548978, + "epoch": 0.9132358599461331, + "flos": 468976200192.0, + "grad_norm": 0.06465782208543523, + "language_loss": 0.83286655, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84356076, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.13952637, + "step": 4747, + "time_per_iteration": 2.554649829864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060732, + "balance_loss_mlp": 1.04640317, + "epoch": 0.9134282416313967, + "flos": 506097828864.0, + "grad_norm": 0.077833753798199, + "language_loss": 0.79933763, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.80994493, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.14343262, + "step": 4748, + "time_per_iteration": 2.6685752868652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067379, + "balance_loss_mlp": 1.05307388, + "epoch": 0.9136206233166603, + "flos": 604819519488.0, + "grad_norm": 0.07038608549494893, + "language_loss": 0.84113944, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.8518132, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 
0.14294434, + "step": 4749, + "time_per_iteration": 2.7418196201324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067371, + "balance_loss_mlp": 1.05276763, + "epoch": 0.9138130050019239, + "flos": 561738240000.0, + "grad_norm": 0.07853084861256635, + "language_loss": 0.82891721, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.83959091, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.14599609, + "step": 4750, + "time_per_iteration": 2.6632273197174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071287, + "balance_loss_mlp": 1.05674314, + "epoch": 0.9140053866871873, + "flos": 690117221376.0, + "grad_norm": 0.20385434890647738, + "language_loss": 0.9033643, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.91407716, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.14526367, + "step": 4751, + "time_per_iteration": 2.8812520503997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063694, + "balance_loss_mlp": 1.0491389, + "epoch": 0.9141977683724509, + "flos": 551012336640.0, + "grad_norm": 0.06632997403906014, + "language_loss": 0.84121269, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85184962, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.14550781, + "step": 4752, + "time_per_iteration": 2.742851495742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067674, + "balance_loss_mlp": 1.05316639, + "epoch": 0.9143901500577145, + "flos": 540088943616.0, + "grad_norm": 0.0723562787374101, + "language_loss": 0.7560128, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76668954, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.14489746, + "step": 4753, + "time_per_iteration": 2.6533844470977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066571, + "balance_loss_mlp": 1.05187273, + "epoch": 0.9145825317429781, + "flos": 528767626752.0, + "grad_norm": 0.2806334555286775, + 
"language_loss": 0.80675328, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.81741893, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.14672852, + "step": 4754, + "time_per_iteration": 2.632373094558716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073062, + "balance_loss_mlp": 1.05884004, + "epoch": 0.9147749134282416, + "flos": 514792641024.0, + "grad_norm": 0.06108923695610088, + "language_loss": 0.7887972, + "learning_rate": 1.892702433097776e-05, + "loss": 0.79952776, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.14208984, + "step": 4755, + "time_per_iteration": 2.648470640182495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067557, + "balance_loss_mlp": 1.05308533, + "epoch": 0.9149672951135052, + "flos": 514441704960.0, + "grad_norm": 0.07002276071565354, + "language_loss": 0.85469049, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.8653661, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.14453125, + "step": 4756, + "time_per_iteration": 2.65950345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065171, + "balance_loss_mlp": 1.0507822, + "epoch": 0.9151596767987688, + "flos": 577069608960.0, + "grad_norm": 0.06465328138822253, + "language_loss": 0.812971, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.8236227, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.1439209, + "step": 4757, + "time_per_iteration": 2.747659683227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068049, + "balance_loss_mlp": 1.05413735, + "epoch": 0.9153520584840323, + "flos": 619335590400.0, + "grad_norm": 0.07075275740266723, + "language_loss": 0.82515383, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.83583432, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.13928223, + "step": 4758, + "time_per_iteration": 2.752819538116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01069271, + "balance_loss_mlp": 1.05499017, + "epoch": 0.9155444401692959, + "flos": 468921871872.0, + "grad_norm": 0.11778599796546448, + "language_loss": 0.82900631, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.83969903, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.14294434, + "step": 4759, + "time_per_iteration": 2.6110918521881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008192, + "balance_loss_mlp": 1.00194573, + "epoch": 0.9157368218545594, + "flos": 1410711054336.0, + "grad_norm": 0.006027089947377037, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75827265, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.0625, + "step": 4760, + "time_per_iteration": 4.842655420303345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008593, + "balance_loss_mlp": 1.00234604, + "epoch": 0.915929203539823, + "flos": 1522019040768.0, + "grad_norm": 0.006843317514305485, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80584645, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.0625, + "step": 4761, + "time_per_iteration": 4.96194052696228 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065486, + "balance_loss_mlp": 1.05128801, + "epoch": 0.9161215852250866, + "flos": 535752548352.0, + "grad_norm": 0.06584819530100704, + "language_loss": 0.80398101, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81463587, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.14221191, + "step": 4762, + "time_per_iteration": 2.758117437362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066785, + "balance_loss_mlp": 1.05253971, + "epoch": 0.9163139669103502, + "flos": 590624649216.0, + "grad_norm": 0.07549229769112886, + "language_loss": 0.80455124, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.81521916, + "num_input_tokens_seen": 393910512, + 
"router_z_loss_mlp": 0.14233398, + "step": 4763, + "time_per_iteration": 2.7597572803497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071711, + "balance_loss_mlp": 1.05723906, + "epoch": 0.9165063485956138, + "flos": 821975081472.0, + "grad_norm": 0.05989535024703023, + "language_loss": 0.84422004, + "learning_rate": 1.817043762598397e-05, + "loss": 0.8549372, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.14477539, + "step": 4764, + "time_per_iteration": 3.077842950820923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066851, + "balance_loss_mlp": 1.05241537, + "epoch": 0.9166987302808772, + "flos": 525194772480.0, + "grad_norm": 0.09553183117791494, + "language_loss": 0.8191523, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.82982075, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.14428711, + "step": 4765, + "time_per_iteration": 2.644554376602173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069218, + "balance_loss_mlp": 1.05447185, + "epoch": 0.9168911119661408, + "flos": 655095693312.0, + "grad_norm": 0.06729500236914439, + "language_loss": 0.84236819, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85306036, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.14733887, + "step": 4766, + "time_per_iteration": 2.9453341960906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074214, + "balance_loss_mlp": 1.05980158, + "epoch": 0.9170834936514044, + "flos": 491747314176.0, + "grad_norm": 0.06576753433024131, + "language_loss": 0.84860098, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.85934317, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.14416504, + "step": 4767, + "time_per_iteration": 2.5406041145324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067338, + "balance_loss_mlp": 1.05275846, + "epoch": 0.917275875336668, + "flos": 628040314368.0, + "grad_norm": 
0.09157964796114802, + "language_loss": 0.80223978, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.81291318, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.14550781, + "step": 4768, + "time_per_iteration": 2.809734344482422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010723, + "balance_loss_mlp": 1.00447667, + "epoch": 0.9174682570219315, + "flos": 1517981824512.0, + "grad_norm": 0.007898028987614235, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79190958, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.0625, + "step": 4769, + "time_per_iteration": 4.9082324504852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066063, + "balance_loss_mlp": 1.05191278, + "epoch": 0.917660638707195, + "flos": 560021008896.0, + "grad_norm": 0.06184746471686271, + "language_loss": 0.84936714, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.86002773, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.1418457, + "step": 4770, + "time_per_iteration": 2.679088830947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072014, + "balance_loss_mlp": 1.05756545, + "epoch": 0.9178530203924586, + "flos": 447252751872.0, + "grad_norm": 0.09313381459116095, + "language_loss": 0.83899945, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.84971958, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.14440918, + "step": 4771, + "time_per_iteration": 2.519746780395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065875, + "balance_loss_mlp": 1.05180812, + "epoch": 0.9180454020777222, + "flos": 465981507072.0, + "grad_norm": 0.07078092470079442, + "language_loss": 0.8057059, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.81636465, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.140625, + "step": 4772, + "time_per_iteration": 2.573575496673584 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01069359, + "balance_loss_mlp": 1.05537605, + "epoch": 0.9182377837629858, + "flos": 596314856448.0, + "grad_norm": 0.08298754862360035, + "language_loss": 0.87202299, + "learning_rate": 1.74290029706784e-05, + "loss": 0.88271654, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.13989258, + "step": 4773, + "time_per_iteration": 2.782898187637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071717, + "balance_loss_mlp": 1.05753124, + "epoch": 0.9184301654482493, + "flos": 996671941632.0, + "grad_norm": 0.06677981987343952, + "language_loss": 0.82528126, + "learning_rate": 1.734755767142876e-05, + "loss": 0.83599842, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.14196777, + "step": 4774, + "time_per_iteration": 3.3350989818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069723, + "balance_loss_mlp": 1.05540562, + "epoch": 0.9186225471335129, + "flos": 508860154368.0, + "grad_norm": 0.07200425768913102, + "language_loss": 0.84860492, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.85930216, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.14306641, + "step": 4775, + "time_per_iteration": 2.7125747203826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067689, + "balance_loss_mlp": 1.05345559, + "epoch": 0.9188149288187765, + "flos": 940423633920.0, + "grad_norm": 0.07577615196138396, + "language_loss": 0.78980851, + "learning_rate": 1.718522925136551e-05, + "loss": 0.80048543, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.14245605, + "step": 4776, + "time_per_iteration": 3.3351941108703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065369, + "balance_loss_mlp": 1.05136228, + "epoch": 0.91900731050404, + "flos": 583674232320.0, + "grad_norm": 0.08146197777200662, + "language_loss": 0.83863878, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.84929252, + "num_input_tokens_seen": 
395113824, + "router_z_loss_mlp": 0.14013672, + "step": 4777, + "time_per_iteration": 2.672926664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067592, + "balance_loss_mlp": 1.05339432, + "epoch": 0.9191996921893035, + "flos": 581213283840.0, + "grad_norm": 0.08031024809047536, + "language_loss": 0.79444981, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.80512571, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.14196777, + "step": 4778, + "time_per_iteration": 2.6956064701080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065323, + "balance_loss_mlp": 1.05133939, + "epoch": 0.9193920738745671, + "flos": 908935686144.0, + "grad_norm": 0.0795713014857256, + "language_loss": 0.79998899, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.81064218, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.13989258, + "step": 4779, + "time_per_iteration": 3.103442430496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010724, + "balance_loss_mlp": 1.00447774, + "epoch": 0.9195844555598307, + "flos": 1558372359168.0, + "grad_norm": 0.00788177819914121, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.80806112, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.0625, + "step": 4780, + "time_per_iteration": 4.735037326812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065599, + "balance_loss_mlp": 1.05109096, + "epoch": 0.9197768372450943, + "flos": 474053741568.0, + "grad_norm": 0.07512893938513913, + "language_loss": 0.78746933, + "learning_rate": 1.678268904252317e-05, + "loss": 0.79812533, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.14489746, + "step": 4781, + "time_per_iteration": 2.600867748260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069019, + "balance_loss_mlp": 1.05447555, + "epoch": 0.9199692189303579, + "flos": 857016059904.0, + "grad_norm": 
0.07162373169209806, + "language_loss": 0.84339678, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85408694, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.14526367, + "step": 4782, + "time_per_iteration": 3.215178966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070786, + "balance_loss_mlp": 1.05633759, + "epoch": 0.9201616006156214, + "flos": 504390509568.0, + "grad_norm": 0.08066982775893859, + "language_loss": 0.77412266, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78483045, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.14428711, + "step": 4783, + "time_per_iteration": 2.6220128536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065795, + "balance_loss_mlp": 1.05112016, + "epoch": 0.9203539823008849, + "flos": 548781184512.0, + "grad_norm": 0.07094596583832717, + "language_loss": 0.84888017, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.85953808, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.14660645, + "step": 4784, + "time_per_iteration": 2.7147135734558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063787, + "balance_loss_mlp": 1.04949427, + "epoch": 0.9205463639861485, + "flos": 540004879872.0, + "grad_norm": 0.0697955041806186, + "language_loss": 0.8231988, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83383662, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.14294434, + "step": 4785, + "time_per_iteration": 2.6527657508850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065383, + "balance_loss_mlp": 1.05107796, + "epoch": 0.9207387456714121, + "flos": 799725229056.0, + "grad_norm": 0.07376799317416433, + "language_loss": 0.78239089, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.79304475, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.14306641, + "step": 4786, + "time_per_iteration": 3.0446088314056396 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01068332, + "balance_loss_mlp": 1.05381203, + "epoch": 0.9209311273566756, + "flos": 502848746496.0, + "grad_norm": 0.07061124245304527, + "language_loss": 0.78827488, + "learning_rate": 1.630583198044333e-05, + "loss": 0.79895824, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.14501953, + "step": 4787, + "time_per_iteration": 2.6726601123809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069793, + "balance_loss_mlp": 1.05524909, + "epoch": 0.9211235090419392, + "flos": 569323717632.0, + "grad_norm": 0.07225837689316757, + "language_loss": 0.82407451, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.83477247, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.1451416, + "step": 4788, + "time_per_iteration": 2.759333372116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070256, + "balance_loss_mlp": 1.05587983, + "epoch": 0.9213158907272028, + "flos": 806549736960.0, + "grad_norm": 0.07835981374467402, + "language_loss": 0.8217482, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.83245075, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.14379883, + "step": 4789, + "time_per_iteration": 3.032362937927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064491, + "balance_loss_mlp": 1.04975629, + "epoch": 0.9215082724124664, + "flos": 490682396160.0, + "grad_norm": 0.07372153379285619, + "language_loss": 0.76175332, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.77239823, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.14697266, + "step": 4790, + "time_per_iteration": 2.564042806625366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011897, + "balance_loss_mlp": 1.00565076, + "epoch": 0.9217006540977299, + "flos": 1514495232000.0, + "grad_norm": 0.008243719143982569, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78082162, + 
"num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.0625, + "step": 4791, + "time_per_iteration": 4.962024211883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107205, + "balance_loss_mlp": 1.05747056, + "epoch": 0.9218930357829934, + "flos": 743793352704.0, + "grad_norm": 0.06990923251195422, + "language_loss": 0.76191884, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77263939, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.14562988, + "step": 4792, + "time_per_iteration": 2.945852756500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067274, + "balance_loss_mlp": 1.05317199, + "epoch": 0.922085417468257, + "flos": 453036934656.0, + "grad_norm": 0.08745531559957272, + "language_loss": 0.80308163, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.81375438, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.14086914, + "step": 4793, + "time_per_iteration": 2.528007984161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062006, + "balance_loss_mlp": 1.04761744, + "epoch": 0.9222777991535206, + "flos": 500249405952.0, + "grad_norm": 0.07470871936442788, + "language_loss": 0.85101461, + "learning_rate": 1.575804349061616e-05, + "loss": 0.86163461, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.14367676, + "step": 4794, + "time_per_iteration": 2.5768916606903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069107, + "balance_loss_mlp": 1.0545758, + "epoch": 0.9224701808387842, + "flos": 527959669248.0, + "grad_norm": 0.07688197326977388, + "language_loss": 0.78963321, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.80032432, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.14550781, + "step": 4795, + "time_per_iteration": 2.5921027660369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067432, + "balance_loss_mlp": 1.05369949, + "epoch": 0.9226625625240477, + "flos": 
874640623104.0, + "grad_norm": 0.06384518887358884, + "language_loss": 0.75556517, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.7662394, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.13757324, + "step": 4796, + "time_per_iteration": 3.133517026901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069777, + "balance_loss_mlp": 1.05577016, + "epoch": 0.9228549442093112, + "flos": 502774594560.0, + "grad_norm": 0.07425958905143133, + "language_loss": 0.87898898, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.88968676, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.14013672, + "step": 4797, + "time_per_iteration": 2.5944347381591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058503, + "balance_loss_mlp": 1.04417384, + "epoch": 0.9230473258945748, + "flos": 599989026816.0, + "grad_norm": 0.06949272728529254, + "language_loss": 0.85180724, + "learning_rate": 1.544915681564829e-05, + "loss": 0.86239231, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.14331055, + "step": 4798, + "time_per_iteration": 2.866840362548828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059378, + "balance_loss_mlp": 1.04494166, + "epoch": 0.9232397075798384, + "flos": 822508826112.0, + "grad_norm": 0.09329142732010037, + "language_loss": 0.79354167, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.8041355, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.14404297, + "step": 4799, + "time_per_iteration": 3.091614246368408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068353, + "balance_loss_mlp": 1.05402422, + "epoch": 0.923432089265102, + "flos": 707030000640.0, + "grad_norm": 0.08846547031337017, + "language_loss": 0.84656245, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.85724592, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.14343262, + "step": 4800, + "time_per_iteration": 2.9078805446624756 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065128, + "balance_loss_mlp": 1.05058432, + "epoch": 0.9236244709503655, + "flos": 701861054976.0, + "grad_norm": 0.0965298832056426, + "language_loss": 0.76793849, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.77858973, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.1451416, + "step": 4801, + "time_per_iteration": 2.9429283142089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067185, + "balance_loss_mlp": 1.05273724, + "epoch": 0.9238168526356291, + "flos": 515039689728.0, + "grad_norm": 0.07355444560642876, + "language_loss": 0.83979952, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.85047144, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.14428711, + "step": 4802, + "time_per_iteration": 2.660900592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064512, + "balance_loss_mlp": 1.04980135, + "epoch": 0.9240092343208927, + "flos": 492024098304.0, + "grad_norm": 0.07241857247571085, + "language_loss": 0.81500518, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.82565027, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.14697266, + "step": 4803, + "time_per_iteration": 2.5832154750823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106681, + "balance_loss_mlp": 1.05224264, + "epoch": 0.9242016160061562, + "flos": 647218750464.0, + "grad_norm": 0.06962127542934941, + "language_loss": 0.73689508, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.74756318, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.14562988, + "step": 4804, + "time_per_iteration": 2.900785446166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065366, + "balance_loss_mlp": 1.05101275, + "epoch": 0.9243939976914197, + "flos": 729430354944.0, + "grad_norm": 0.07299214948717701, + "language_loss": 0.79122543, + "learning_rate": 1.4915848868484016e-05, + "loss": 
0.80187905, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.14367676, + "step": 4805, + "time_per_iteration": 2.974085807800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067059, + "balance_loss_mlp": 1.05249214, + "epoch": 0.9245863793766833, + "flos": 452246229504.0, + "grad_norm": 0.09554906471121519, + "language_loss": 0.90652919, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.91719973, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.14550781, + "step": 4806, + "time_per_iteration": 2.6065399646759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066321, + "balance_loss_mlp": 1.05214715, + "epoch": 0.9247787610619469, + "flos": 755030605824.0, + "grad_norm": 0.08136491932055226, + "language_loss": 0.76982534, + "learning_rate": 1.476516966469732e-05, + "loss": 0.78048849, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.1418457, + "step": 4807, + "time_per_iteration": 2.940830945968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066915, + "balance_loss_mlp": 1.05195403, + "epoch": 0.9249711427472105, + "flos": 561928389120.0, + "grad_norm": 0.06417953395011357, + "language_loss": 0.85199314, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.86266232, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.14953613, + "step": 4808, + "time_per_iteration": 2.771059274673462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067535, + "balance_loss_mlp": 1.05274093, + "epoch": 0.9251635244324741, + "flos": 526699459584.0, + "grad_norm": 0.06608853421706948, + "language_loss": 0.85035574, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86103106, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.14770508, + "step": 4809, + "time_per_iteration": 2.694859266281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064571, + "balance_loss_mlp": 1.05006337, + "epoch": 0.9253559061177375, + 
"flos": 611280608256.0, + "grad_norm": 0.07734863972631102, + "language_loss": 0.79164314, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.80228883, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.14501953, + "step": 4810, + "time_per_iteration": 2.813447952270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008835, + "balance_loss_mlp": 1.00258815, + "epoch": 0.9255482878030011, + "flos": 1551258957312.0, + "grad_norm": 0.006837733446229171, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77934223, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.0625, + "step": 4811, + "time_per_iteration": 4.736983299255371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072178, + "balance_loss_mlp": 1.05792069, + "epoch": 0.9257406694882647, + "flos": 766366603776.0, + "grad_norm": 0.08162744233386064, + "language_loss": 0.80772638, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.81844819, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.14245605, + "step": 4812, + "time_per_iteration": 3.054450750350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106576, + "balance_loss_mlp": 1.05143118, + "epoch": 0.9259330511735283, + "flos": 497991089664.0, + "grad_norm": 0.08374138374324222, + "language_loss": 0.83075398, + "learning_rate": 1.431765421986686e-05, + "loss": 0.84141165, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.14331055, + "step": 4813, + "time_per_iteration": 2.639411687850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067481, + "balance_loss_mlp": 1.05308032, + "epoch": 0.9261254328587919, + "flos": 626874080256.0, + "grad_norm": 0.08153883506876486, + "language_loss": 0.79092741, + "learning_rate": 1.424372809925273e-05, + "loss": 0.80160224, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.14379883, + "step": 4814, + "time_per_iteration": 2.75715708732605 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067664, + "balance_loss_mlp": 1.0535382, + "epoch": 0.9263178145440554, + "flos": 597382345728.0, + "grad_norm": 0.07799817914897651, + "language_loss": 0.85397398, + "learning_rate": 1.416999056594831e-05, + "loss": 0.86465067, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.14135742, + "step": 4815, + "time_per_iteration": 2.766474723815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068843, + "balance_loss_mlp": 1.05474079, + "epoch": 0.926510196229319, + "flos": 388563319296.0, + "grad_norm": 0.09007822488633566, + "language_loss": 0.83693337, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.84762168, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.14099121, + "step": 4816, + "time_per_iteration": 2.4716956615448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067782, + "balance_loss_mlp": 1.05334568, + "epoch": 0.9267025779145825, + "flos": 545798974464.0, + "grad_norm": 0.09455897697825383, + "language_loss": 0.84119844, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85187626, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.14404297, + "step": 4817, + "time_per_iteration": 2.6396780014038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070795, + "balance_loss_mlp": 1.05666864, + "epoch": 0.9268949595998461, + "flos": 499789813248.0, + "grad_norm": 0.06599557905688819, + "language_loss": 0.82125562, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.8319636, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.14135742, + "step": 4818, + "time_per_iteration": 2.636826992034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063864, + "balance_loss_mlp": 1.04934382, + "epoch": 0.9270873412851096, + "flos": 432828085248.0, + "grad_norm": 0.07547927657855338, + "language_loss": 0.82790685, + "learning_rate": 1.3876926877279817e-05, + "loss": 
0.83854544, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.14501953, + "step": 4819, + "time_per_iteration": 2.6638593673706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065963, + "balance_loss_mlp": 1.05182457, + "epoch": 0.9272797229703732, + "flos": 466769640960.0, + "grad_norm": 0.08095696097618853, + "language_loss": 0.85950172, + "learning_rate": 1.380413270847164e-05, + "loss": 0.87016135, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.14135742, + "step": 4820, + "time_per_iteration": 2.61427640914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065047, + "balance_loss_mlp": 1.05034828, + "epoch": 0.9274721046556368, + "flos": 704838122496.0, + "grad_norm": 0.1143373628903449, + "language_loss": 0.79004455, + "learning_rate": 1.373152729763938e-05, + "loss": 0.800695, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.14672852, + "step": 4821, + "time_per_iteration": 3.046144723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008904, + "balance_loss_mlp": 1.00265718, + "epoch": 0.9276644863409004, + "flos": 1402255950336.0, + "grad_norm": 0.006840762766732248, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83389366, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.0625, + "step": 4822, + "time_per_iteration": 4.890657901763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.05101109, + "epoch": 0.927856868026164, + "flos": 741722614272.0, + "grad_norm": 0.07071256665988469, + "language_loss": 0.80128741, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.81193984, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.14245605, + "step": 4823, + "time_per_iteration": 3.0425524711608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068987, + "balance_loss_mlp": 1.05459857, + "epoch": 0.9280492497114274, + 
"flos": 412223883264.0, + "grad_norm": 0.08552550335100627, + "language_loss": 0.73997277, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.75066262, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.14367676, + "step": 4824, + "time_per_iteration": 2.5228898525238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066643, + "balance_loss_mlp": 1.05221891, + "epoch": 0.928241631396691, + "flos": 646504768512.0, + "grad_norm": 0.1058125688728975, + "language_loss": 0.83748496, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.84815139, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.14428711, + "step": 4825, + "time_per_iteration": 2.7949647903442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.0524534, + "epoch": 0.9284340130819546, + "flos": 696855094272.0, + "grad_norm": 0.06762366006389377, + "language_loss": 0.80860943, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.81927556, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.14154053, + "step": 4826, + "time_per_iteration": 3.0083041191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065832, + "balance_loss_mlp": 1.05130076, + "epoch": 0.9286263947672182, + "flos": 759132062208.0, + "grad_norm": 0.058538603335969094, + "language_loss": 0.83601272, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.84667104, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.14538574, + "step": 4827, + "time_per_iteration": 3.053251266479492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106725, + "balance_loss_mlp": 1.05277789, + "epoch": 0.9288187764524817, + "flos": 672823770624.0, + "grad_norm": 0.07416078307622533, + "language_loss": 0.80154818, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81222069, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.14465332, + "step": 4828, + "time_per_iteration": 
2.9393198490142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070143, + "balance_loss_mlp": 1.05600464, + "epoch": 0.9290111581377453, + "flos": 500469290496.0, + "grad_norm": 0.08130482924978269, + "language_loss": 0.83759892, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.84830034, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.14135742, + "step": 4829, + "time_per_iteration": 2.5792195796966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008805, + "balance_loss_mlp": 1.00255847, + "epoch": 0.9292035398230089, + "flos": 1563627566592.0, + "grad_norm": 0.006851389377426983, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.7313087, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.0625, + "step": 4830, + "time_per_iteration": 4.982414722442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008797, + "balance_loss_mlp": 1.00255001, + "epoch": 0.9293959215082724, + "flos": 1518673411584.0, + "grad_norm": 0.006852258928936433, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.8052063, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.0625, + "step": 4831, + "time_per_iteration": 4.867983341217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064215, + "balance_loss_mlp": 1.04946923, + "epoch": 0.929588303193536, + "flos": 557836844544.0, + "grad_norm": 0.08964822513269201, + "language_loss": 0.83914268, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.84978479, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.14733887, + "step": 4832, + "time_per_iteration": 2.731626272201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067018, + "balance_loss_mlp": 1.05259395, + "epoch": 0.9297806848787995, + "flos": 478580285952.0, + "grad_norm": 0.08096360124602932, + "language_loss": 0.80279034, + "learning_rate": 
1.2874995676786905e-05, + "loss": 0.81346047, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.14416504, + "step": 4833, + "time_per_iteration": 2.5306105613708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069828, + "balance_loss_mlp": 1.05523705, + "epoch": 0.9299730665640631, + "flos": 564537641472.0, + "grad_norm": 0.07716391645519798, + "language_loss": 0.80337012, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.81406838, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.14587402, + "step": 4834, + "time_per_iteration": 2.769162654876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067198, + "balance_loss_mlp": 1.05315518, + "epoch": 0.9301654482493267, + "flos": 560174082048.0, + "grad_norm": 0.08258292328826174, + "language_loss": 0.82621527, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83688718, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.14050293, + "step": 4835, + "time_per_iteration": 2.7922258377075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008368, + "balance_loss_mlp": 1.00212157, + "epoch": 0.9303578299345903, + "flos": 1520096606208.0, + "grad_norm": 0.006037144281499386, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77860808, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.0625, + "step": 4836, + "time_per_iteration": 5.028789281845093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106964, + "balance_loss_mlp": 1.05518019, + "epoch": 0.9305502116198537, + "flos": 530843134464.0, + "grad_norm": 0.0847586307722568, + "language_loss": 0.82820576, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.83890218, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.14477539, + "step": 4837, + "time_per_iteration": 2.7545266151428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106575, + "balance_loss_mlp": 1.05152798, 
+ "epoch": 0.9307425933051173, + "flos": 474898775040.0, + "grad_norm": 0.10091492375500201, + "language_loss": 0.81374753, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.82440501, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.14221191, + "step": 4838, + "time_per_iteration": 2.5650086402893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066794, + "balance_loss_mlp": 1.05262041, + "epoch": 0.9309349749903809, + "flos": 584892223488.0, + "grad_norm": 0.06802795816688911, + "language_loss": 0.86762273, + "learning_rate": 1.245693929549213e-05, + "loss": 0.87829065, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.14172363, + "step": 4839, + "time_per_iteration": 2.7399027347564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067935, + "balance_loss_mlp": 1.05323696, + "epoch": 0.9311273566756445, + "flos": 861666315264.0, + "grad_norm": 0.061246816450390304, + "language_loss": 0.76902699, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.77970636, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.14697266, + "step": 4840, + "time_per_iteration": 3.094343662261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064483, + "balance_loss_mlp": 1.05036891, + "epoch": 0.9313197383609081, + "flos": 548094366720.0, + "grad_norm": 0.07423165545270044, + "language_loss": 0.82531536, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83596015, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.14111328, + "step": 4841, + "time_per_iteration": 2.644543409347534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067111, + "balance_loss_mlp": 1.05234146, + "epoch": 0.9315121200461716, + "flos": 468756315648.0, + "grad_norm": 0.07934461180224898, + "language_loss": 0.80920649, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.81987751, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.14746094, + "step": 4842, + 
"time_per_iteration": 2.531503200531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010695, + "balance_loss_mlp": 1.05520725, + "epoch": 0.9317045017314352, + "flos": 417659701248.0, + "grad_norm": 0.09122978620676214, + "language_loss": 0.77761424, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.78830928, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.14294434, + "step": 4843, + "time_per_iteration": 2.5434539318084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066121, + "balance_loss_mlp": 1.0517211, + "epoch": 0.9318968834166987, + "flos": 540489065472.0, + "grad_norm": 0.07745490601900848, + "language_loss": 0.77002335, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.78068453, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.14379883, + "step": 4844, + "time_per_iteration": 2.75764536857605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071049, + "balance_loss_mlp": 1.05657732, + "epoch": 0.9320892651019623, + "flos": 521330452992.0, + "grad_norm": 0.0723719710231409, + "language_loss": 0.80705845, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.81776899, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.14465332, + "step": 4845, + "time_per_iteration": 2.638796329498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066086, + "balance_loss_mlp": 1.05176866, + "epoch": 0.9322816467872258, + "flos": 582072998400.0, + "grad_norm": 0.08462986407031685, + "language_loss": 0.80856788, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.81922877, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.14294434, + "step": 4846, + "time_per_iteration": 2.7654807567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068849, + "balance_loss_mlp": 1.05425739, + "epoch": 0.9324740284724894, + "flos": 484747338240.0, + "grad_norm": 0.06874284999019256, + "language_loss": 0.81851065, + 
"learning_rate": 1.191013150742537e-05, + "loss": 0.82919914, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.14562988, + "step": 4847, + "time_per_iteration": 2.7250354290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065082, + "balance_loss_mlp": 1.05045485, + "epoch": 0.932666410157753, + "flos": 732585461760.0, + "grad_norm": 0.07610709588397915, + "language_loss": 0.82762969, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.8382805, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.14599609, + "step": 4848, + "time_per_iteration": 3.0495240688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061882, + "balance_loss_mlp": 1.04709959, + "epoch": 0.9328587918430166, + "flos": 965537127936.0, + "grad_norm": 0.0681296459955147, + "language_loss": 0.78606904, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.7966879, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.14758301, + "step": 4849, + "time_per_iteration": 3.258883476257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067459, + "balance_loss_mlp": 1.05304718, + "epoch": 0.9330511735282802, + "flos": 614552085504.0, + "grad_norm": 0.07341597020423554, + "language_loss": 0.8031379, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81381249, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.14416504, + "step": 4850, + "time_per_iteration": 2.7913970947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067673, + "balance_loss_mlp": 1.05315292, + "epoch": 0.9332435552135436, + "flos": 559101823488.0, + "grad_norm": 0.06951719849091244, + "language_loss": 0.85507822, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.8657549, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.14489746, + "step": 4851, + "time_per_iteration": 2.708566427230835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063138, + 
"balance_loss_mlp": 1.04864168, + "epoch": 0.9334359368988072, + "flos": 515536358400.0, + "grad_norm": 0.08442635814792841, + "language_loss": 0.81517386, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.82580519, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.14501953, + "step": 4852, + "time_per_iteration": 2.599057912826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062002, + "balance_loss_mlp": 1.04773307, + "epoch": 0.9336283185840708, + "flos": 539809588224.0, + "grad_norm": 0.09818611570102258, + "language_loss": 0.82896936, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.83958936, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.1427002, + "step": 4853, + "time_per_iteration": 2.7612810134887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008076, + "balance_loss_mlp": 1.00182903, + "epoch": 0.9338207002693344, + "flos": 1562824751616.0, + "grad_norm": 0.0067103682238986335, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.79463089, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.06225586, + "step": 4854, + "time_per_iteration": 4.886642694473267 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065823, + "balance_loss_mlp": 1.05126715, + "epoch": 0.9340130819545979, + "flos": 645261811200.0, + "grad_norm": 0.05765482592606577, + "language_loss": 0.81226462, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.82292283, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.14550781, + "step": 4855, + "time_per_iteration": 2.954638957977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063009, + "balance_loss_mlp": 1.0485847, + "epoch": 0.9342054636398615, + "flos": 503441588736.0, + "grad_norm": 0.06536242682566035, + "language_loss": 0.76978171, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.78041184, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 
0.14416504, + "step": 4856, + "time_per_iteration": 2.706509828567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068073, + "balance_loss_mlp": 1.05385113, + "epoch": 0.934397845325125, + "flos": 593026126848.0, + "grad_norm": 0.06559583357944072, + "language_loss": 0.84238493, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.85306573, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.14233398, + "step": 4857, + "time_per_iteration": 2.8610572814941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068248, + "balance_loss_mlp": 1.05399108, + "epoch": 0.9345902270103886, + "flos": 499891129344.0, + "grad_norm": 0.07255060055784338, + "language_loss": 0.80180097, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.81248355, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.1427002, + "step": 4858, + "time_per_iteration": 2.552783727645874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008126, + "balance_loss_mlp": 1.00187981, + "epoch": 0.9347826086956522, + "flos": 1520329347072.0, + "grad_norm": 0.006716178766718662, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.7699585, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.06225586, + "step": 4859, + "time_per_iteration": 4.672068357467651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068402, + "balance_loss_mlp": 1.05360794, + "epoch": 0.9349749903809157, + "flos": 504550923264.0, + "grad_norm": 0.06024273596411253, + "language_loss": 0.81101823, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.8217023, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.14782715, + "step": 4860, + "time_per_iteration": 2.807316541671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065413, + "balance_loss_mlp": 1.05103672, + "epoch": 0.9351673720661793, + "flos": 568901200896.0, + "grad_norm": 0.09837835875374011, + 
"language_loss": 0.78307474, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.79372889, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.14367676, + "step": 4861, + "time_per_iteration": 2.6451520919799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066536, + "balance_loss_mlp": 1.05227828, + "epoch": 0.9353597537514429, + "flos": 544605576192.0, + "grad_norm": 0.07168847112729194, + "language_loss": 0.86299908, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.8736645, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.1427002, + "step": 4862, + "time_per_iteration": 2.6640143394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060222, + "balance_loss_mlp": 1.04584587, + "epoch": 0.9355521354367065, + "flos": 518997984768.0, + "grad_norm": 0.06539054611804387, + "language_loss": 0.8488996, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.85950184, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.14379883, + "step": 4863, + "time_per_iteration": 2.8074288368225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068234, + "balance_loss_mlp": 1.05423915, + "epoch": 0.93574451712197, + "flos": 446316314112.0, + "grad_norm": 0.07358102996376413, + "language_loss": 0.7843712, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.79505354, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.13989258, + "step": 4864, + "time_per_iteration": 2.5553953647613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066209, + "balance_loss_mlp": 1.0520947, + "epoch": 0.9359368988072335, + "flos": 480517401600.0, + "grad_norm": 0.0873526788853267, + "language_loss": 0.76845741, + "learning_rate": 1.072417553472832e-05, + "loss": 0.77911949, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.14135742, + "step": 4865, + "time_per_iteration": 2.538104295730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01064436, + "balance_loss_mlp": 1.05001128, + "epoch": 0.9361292804924971, + "flos": 497118892032.0, + "grad_norm": 0.07011272694309466, + "language_loss": 0.85173476, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.86237907, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.1439209, + "step": 4866, + "time_per_iteration": 2.601087808609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062752, + "balance_loss_mlp": 1.04833984, + "epoch": 0.9363216621777607, + "flos": 618122368512.0, + "grad_norm": 0.06413466632089472, + "language_loss": 0.84145504, + "learning_rate": 1.059619902982184e-05, + "loss": 0.85208255, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.14416504, + "step": 4867, + "time_per_iteration": 2.771540403366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008072, + "balance_loss_mlp": 1.00182533, + "epoch": 0.9365140438630243, + "flos": 1415929559040.0, + "grad_norm": 0.006714289300712873, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80211407, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.06225586, + "step": 4868, + "time_per_iteration": 4.87546706199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106451, + "balance_loss_mlp": 1.05019283, + "epoch": 0.9367064255482878, + "flos": 590503509504.0, + "grad_norm": 0.07280245758822038, + "language_loss": 0.81747389, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82811898, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.14294434, + "step": 4869, + "time_per_iteration": 2.725703477859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066438, + "balance_loss_mlp": 1.05178761, + "epoch": 0.9368988072335513, + "flos": 526637790720.0, + "grad_norm": 0.08059688875946759, + "language_loss": 0.81905812, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.82972252, + "num_input_tokens_seen": 403431536, + 
"router_z_loss_mlp": 0.14624023, + "step": 4870, + "time_per_iteration": 2.6715352535247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065493, + "balance_loss_mlp": 1.05104446, + "epoch": 0.9370911889188149, + "flos": 743205279744.0, + "grad_norm": 0.07748687060476699, + "language_loss": 0.78810263, + "learning_rate": 1.034252625822113e-05, + "loss": 0.79875755, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.14428711, + "step": 4871, + "time_per_iteration": 2.916724443435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067142, + "balance_loss_mlp": 1.05271745, + "epoch": 0.9372835706040785, + "flos": 546038682624.0, + "grad_norm": 0.07044828072534959, + "language_loss": 0.78682631, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.79749775, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.14404297, + "step": 4872, + "time_per_iteration": 2.6422388553619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065911, + "balance_loss_mlp": 1.05154586, + "epoch": 0.9374759522893421, + "flos": 491633515008.0, + "grad_norm": 0.08201625964477643, + "language_loss": 0.81656861, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.82722771, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.14343262, + "step": 4873, + "time_per_iteration": 2.7001724243164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107035, + "balance_loss_mlp": 1.055902, + "epoch": 0.9376683339746056, + "flos": 578421222912.0, + "grad_norm": 0.07557342334959853, + "language_loss": 0.82583576, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.83653927, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.14428711, + "step": 4874, + "time_per_iteration": 2.7377657890319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065743, + "balance_loss_mlp": 1.05108047, + "epoch": 0.9378607156598692, + "flos": 506290549248.0, + "grad_norm": 
0.0827488528047596, + "language_loss": 0.8048206, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81547809, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.1463623, + "step": 4875, + "time_per_iteration": 2.713914632797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064522, + "balance_loss_mlp": 1.04972827, + "epoch": 0.9380530973451328, + "flos": 520015915008.0, + "grad_norm": 0.09685567377493352, + "language_loss": 0.77720559, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.7878508, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.14782715, + "step": 4876, + "time_per_iteration": 2.6722288131713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062474, + "balance_loss_mlp": 1.04804993, + "epoch": 0.9382454790303963, + "flos": 557799768576.0, + "grad_norm": 0.06317554593447856, + "language_loss": 0.84759539, + "learning_rate": 9.967720642029999e-06, + "loss": 0.8582201, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.14416504, + "step": 4877, + "time_per_iteration": 2.759575128555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064011, + "balance_loss_mlp": 1.04950309, + "epoch": 0.9384378607156598, + "flos": 695476316160.0, + "grad_norm": 0.07554619114049714, + "language_loss": 0.81792021, + "learning_rate": 9.905918764418153e-06, + "loss": 0.82856029, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.14489746, + "step": 4878, + "time_per_iteration": 2.9286370277404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065892, + "balance_loss_mlp": 1.05136013, + "epoch": 0.9386302424009234, + "flos": 554750747136.0, + "grad_norm": 0.07984933040199384, + "language_loss": 0.80986464, + "learning_rate": 9.844307158203058e-06, + "loss": 0.82052362, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.1451416, + "step": 4879, + "time_per_iteration": 2.6613898277282715 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01063468, + "balance_loss_mlp": 1.04887724, + "epoch": 0.938822624086187, + "flos": 566981337600.0, + "grad_norm": 0.08367891448674436, + "language_loss": 0.79728901, + "learning_rate": 9.782885847304469e-06, + "loss": 0.80792373, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.14587402, + "step": 4880, + "time_per_iteration": 2.7297160625457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067114, + "balance_loss_mlp": 1.05274892, + "epoch": 0.9390150057714506, + "flos": 417602801664.0, + "grad_norm": 0.07679866362319365, + "language_loss": 0.80293953, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81361073, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.14367676, + "step": 4881, + "time_per_iteration": 2.5838916301727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070302, + "balance_loss_mlp": 1.05568695, + "epoch": 0.9392073874567142, + "flos": 1553839967232.0, + "grad_norm": 0.14673478335081816, + "language_loss": 0.76342237, + "learning_rate": 9.660614206766394e-06, + "loss": 0.7741254, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.14599609, + "step": 4882, + "time_per_iteration": 3.6900463104248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068127, + "balance_loss_mlp": 1.05361927, + "epoch": 0.9393997691419776, + "flos": 652536000000.0, + "grad_norm": 0.07340756256614964, + "language_loss": 0.78028488, + "learning_rate": 9.59976392459705e-06, + "loss": 0.79096615, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.14489746, + "step": 4883, + "time_per_iteration": 2.76796817779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100936, + "balance_loss_mlp": 1.00316095, + "epoch": 0.9395921508272412, + "flos": 1553294817792.0, + "grad_norm": 0.00609237494033278, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79180038, + "num_input_tokens_seen": 
404615264, + "router_z_loss_mlp": 0.06201172, + "step": 4884, + "time_per_iteration": 4.855507850646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068527, + "balance_loss_mlp": 1.05419779, + "epoch": 0.9397845325125048, + "flos": 498144162816.0, + "grad_norm": 0.06608100725405705, + "language_loss": 0.78651726, + "learning_rate": 9.478634554578314e-06, + "loss": 0.79720247, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.14318848, + "step": 4885, + "time_per_iteration": 2.647665023803711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066562, + "balance_loss_mlp": 1.05208969, + "epoch": 0.9399769141977684, + "flos": 498596414976.0, + "grad_norm": 0.07655444770073823, + "language_loss": 0.8362307, + "learning_rate": 9.418355513755638e-06, + "loss": 0.84689629, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.14465332, + "step": 4886, + "time_per_iteration": 2.6135685443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010084, + "balance_loss_mlp": 1.00388551, + "epoch": 0.9401692958830319, + "flos": 1402500427776.0, + "grad_norm": 0.007184013701095998, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.80342275, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.06201172, + "step": 4887, + "time_per_iteration": 4.774345397949219 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066371, + "balance_loss_mlp": 1.05168462, + "epoch": 0.9403616775682955, + "flos": 540123448320.0, + "grad_norm": 0.055656393961397786, + "language_loss": 0.84932435, + "learning_rate": 9.298368837495575e-06, + "loss": 0.85998809, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.14660645, + "step": 4888, + "time_per_iteration": 2.833160638809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010135, + "balance_loss_mlp": 1.00388861, + "epoch": 0.9405540592535591, + "flos": 1322058184704.0, + 
"grad_norm": 0.007188675578583431, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76179576, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.06225586, + "step": 4889, + "time_per_iteration": 4.915827989578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065193, + "balance_loss_mlp": 1.0505538, + "epoch": 0.9407464409388226, + "flos": 572362827264.0, + "grad_norm": 0.08497767098869012, + "language_loss": 0.82881129, + "learning_rate": 9.179144190235799e-06, + "loss": 0.83946323, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.14611816, + "step": 4890, + "time_per_iteration": 2.6498968601226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066088, + "balance_loss_mlp": 1.05141306, + "epoch": 0.9409388226240862, + "flos": 511264203264.0, + "grad_norm": 0.06360484730349225, + "language_loss": 0.76604337, + "learning_rate": 9.119817685386112e-06, + "loss": 0.77670431, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.14648438, + "step": 4891, + "time_per_iteration": 2.7343337535858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010139, + "balance_loss_mlp": 1.00389278, + "epoch": 0.9411312043093497, + "flos": 1569901077504.0, + "grad_norm": 0.00718633131099091, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81252027, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.06225586, + "step": 4892, + "time_per_iteration": 4.940707206726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067624, + "balance_loss_mlp": 1.05330682, + "epoch": 0.9413235859946133, + "flos": 569469450240.0, + "grad_norm": 0.07938482085653319, + "language_loss": 0.78470445, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79538065, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.14318848, + "step": 4893, + "time_per_iteration": 2.739201784133911 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068375, + "balance_loss_mlp": 1.05412984, + "epoch": 0.9415159676798769, + "flos": 781905747456.0, + "grad_norm": 0.07942221515797811, + "language_loss": 0.80200732, + "learning_rate": 8.942981722127263e-06, + "loss": 0.81269109, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.14257812, + "step": 4894, + "time_per_iteration": 3.002270460128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068942, + "balance_loss_mlp": 1.05450535, + "epoch": 0.9417083493651405, + "flos": 849341749248.0, + "grad_norm": 0.059340658547535424, + "language_loss": 0.79964054, + "learning_rate": 8.884417661086331e-06, + "loss": 0.81032991, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.14428711, + "step": 4895, + "time_per_iteration": 3.1609625816345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065293, + "balance_loss_mlp": 1.0512265, + "epoch": 0.941900731050404, + "flos": 529333304832.0, + "grad_norm": 0.06768940884435448, + "language_loss": 0.85507524, + "learning_rate": 8.826044268024025e-06, + "loss": 0.86572814, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.14074707, + "step": 4896, + "time_per_iteration": 2.695668935775757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106449, + "balance_loss_mlp": 1.04998195, + "epoch": 0.9420931127356675, + "flos": 557073303552.0, + "grad_norm": 0.2444941012145158, + "language_loss": 0.80151224, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81215715, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.14489746, + "step": 4897, + "time_per_iteration": 2.748248338699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064529, + "balance_loss_mlp": 1.05071259, + "epoch": 0.9422854944209311, + "flos": 652543340544.0, + "grad_norm": 0.0711327665807799, + "language_loss": 0.86498511, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87563032, + 
"num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.13830566, + "step": 4898, + "time_per_iteration": 2.826136827468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067484, + "balance_loss_mlp": 1.053298, + "epoch": 0.9424778761061947, + "flos": 553685829120.0, + "grad_norm": 0.06562049351455196, + "language_loss": 0.83802789, + "learning_rate": 8.65206832296478e-06, + "loss": 0.84870267, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.14196777, + "step": 4899, + "time_per_iteration": 2.702162027359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066084, + "balance_loss_mlp": 1.05165935, + "epoch": 0.9426702577914583, + "flos": 588559053312.0, + "grad_norm": 0.10016132550548382, + "language_loss": 0.79835558, + "learning_rate": 8.594457827702406e-06, + "loss": 0.80901635, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.14416504, + "step": 4900, + "time_per_iteration": 2.7979788780212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072147, + "balance_loss_mlp": 1.05753195, + "epoch": 0.9428626394767218, + "flos": 616625021952.0, + "grad_norm": 0.08155692962699897, + "language_loss": 0.78633022, + "learning_rate": 8.537038112991114e-06, + "loss": 0.79705167, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.14611816, + "step": 4901, + "time_per_iteration": 2.8697218894958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067206, + "balance_loss_mlp": 1.05312753, + "epoch": 0.9430550211619854, + "flos": 610410981888.0, + "grad_norm": 0.07461125959373652, + "language_loss": 0.8191936, + "learning_rate": 8.479809201123178e-06, + "loss": 0.82986569, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.140625, + "step": 4902, + "time_per_iteration": 2.756660223007202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070532, + "balance_loss_mlp": 1.05659688, + "epoch": 0.943247402847249, + "flos": 
565990571520.0, + "grad_norm": 0.08855284632935614, + "language_loss": 0.78214121, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79284656, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.1394043, + "step": 4903, + "time_per_iteration": 2.7230935096740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068043, + "balance_loss_mlp": 1.05367839, + "epoch": 0.9434397845325125, + "flos": 527040483840.0, + "grad_norm": 0.07779943104118621, + "language_loss": 0.81681037, + "learning_rate": 8.365923874716297e-06, + "loss": 0.82749075, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.14343262, + "step": 4904, + "time_per_iteration": 2.6842496395111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065765, + "balance_loss_mlp": 1.05143571, + "epoch": 0.943632166217776, + "flos": 593451214848.0, + "grad_norm": 0.0743178764901382, + "language_loss": 0.8264221, + "learning_rate": 8.309267504391593e-06, + "loss": 0.83707976, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.14318848, + "step": 4905, + "time_per_iteration": 2.7265892028808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063706, + "balance_loss_mlp": 1.04915047, + "epoch": 0.9438245479030396, + "flos": 572770289664.0, + "grad_norm": 0.06559203485836985, + "language_loss": 0.85403311, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86467016, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.14562988, + "step": 4906, + "time_per_iteration": 2.8402488231658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106202, + "balance_loss_mlp": 1.04754758, + "epoch": 0.9440169295883032, + "flos": 488258523648.0, + "grad_norm": 0.08035317225981296, + "language_loss": 0.81744617, + "learning_rate": 8.196527459479242e-06, + "loss": 0.82806635, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.14465332, + "step": 4907, + "time_per_iteration": 2.563253164291382 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061471, + "balance_loss_mlp": 1.04690337, + "epoch": 0.9442093112735668, + "flos": 731742999552.0, + "grad_norm": 0.06131941333469913, + "language_loss": 0.73863798, + "learning_rate": 8.140443828661137e-06, + "loss": 0.74925268, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.14550781, + "step": 4908, + "time_per_iteration": 3.048938751220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.05223215, + "epoch": 0.9444016929588304, + "flos": 571031036928.0, + "grad_norm": 0.1315206917141544, + "language_loss": 0.82031131, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83097517, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.14172363, + "step": 4909, + "time_per_iteration": 2.698793411254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066845, + "balance_loss_mlp": 1.05251575, + "epoch": 0.9445940746440938, + "flos": 509292582912.0, + "grad_norm": 0.06866665177014267, + "language_loss": 0.85794264, + "learning_rate": 8.028849459169318e-06, + "loss": 0.86861104, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.14318848, + "step": 4910, + "time_per_iteration": 2.5939254760742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069973, + "balance_loss_mlp": 1.05559599, + "epoch": 0.9447864563293574, + "flos": 624556293120.0, + "grad_norm": 0.07303339072359728, + "language_loss": 0.80941725, + "learning_rate": 7.97333876382028e-06, + "loss": 0.820117, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.14355469, + "step": 4911, + "time_per_iteration": 2.874375820159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066266, + "balance_loss_mlp": 1.05161524, + "epoch": 0.944978838014621, + "flos": 505270047744.0, + "grad_norm": 0.06964507241753962, + "language_loss": 0.80899501, + "learning_rate": 7.918019090162098e-06, + "loss": 0.81965774, + 
"num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.14648438, + "step": 4912, + "time_per_iteration": 2.760795831680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008957, + "balance_loss_mlp": 1.0027107, + "epoch": 0.9451712196998846, + "flos": 1484205451776.0, + "grad_norm": 0.006122085341861952, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.79296297, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.06225586, + "step": 4913, + "time_per_iteration": 5.09798526763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067638, + "balance_loss_mlp": 1.05330908, + "epoch": 0.9453636013851482, + "flos": 521137732608.0, + "grad_norm": 0.07285167198538633, + "language_loss": 0.90140414, + "learning_rate": 7.80795289375219e-06, + "loss": 0.91208053, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.14343262, + "step": 4914, + "time_per_iteration": 2.6756272315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008919, + "balance_loss_mlp": 1.00267243, + "epoch": 0.9455559830704117, + "flos": 1496902975488.0, + "grad_norm": 0.006122840187546539, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84571272, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.06225586, + "step": 4915, + "time_per_iteration": 4.950132846832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067573, + "balance_loss_mlp": 1.05333984, + "epoch": 0.9457483647556753, + "flos": 498126910464.0, + "grad_norm": 0.07125264775294483, + "language_loss": 0.81883103, + "learning_rate": 7.698651040865534e-06, + "loss": 0.82950681, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.14233398, + "step": 4916, + "time_per_iteration": 2.6505517959594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060955, + "balance_loss_mlp": 1.04641187, + "epoch": 0.9459407464409388, + "flos": 
1019405979648.0, + "grad_norm": 0.06098908339015476, + "language_loss": 0.8214764, + "learning_rate": 7.644286796333222e-06, + "loss": 0.83208597, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.14526367, + "step": 4917, + "time_per_iteration": 3.3748598098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068013, + "balance_loss_mlp": 1.05392301, + "epoch": 0.9461331281262024, + "flos": 513589330944.0, + "grad_norm": 0.11117653680643763, + "language_loss": 0.81199044, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82267058, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.14099121, + "step": 4918, + "time_per_iteration": 2.619296073913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066258, + "balance_loss_mlp": 1.05184555, + "epoch": 0.9463255098114659, + "flos": 528023909376.0, + "grad_norm": 0.09091182296398484, + "language_loss": 0.78226775, + "learning_rate": 7.536131776620936e-06, + "loss": 0.79293031, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.1439209, + "step": 4919, + "time_per_iteration": 2.5946567058563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066597, + "balance_loss_mlp": 1.05230427, + "epoch": 0.9465178914967295, + "flos": 506043500544.0, + "grad_norm": 0.09912025388062279, + "language_loss": 0.83234036, + "learning_rate": 7.482341043430485e-06, + "loss": 0.84300637, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.14294434, + "step": 4920, + "time_per_iteration": 2.630028486251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060019, + "balance_loss_mlp": 1.0456301, + "epoch": 0.9467102731819931, + "flos": 660254727168.0, + "grad_norm": 0.07221122100857683, + "language_loss": 0.8528769, + "learning_rate": 7.428741522553184e-06, + "loss": 0.86347711, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.14379883, + "step": 4921, + "time_per_iteration": 2.894165277481079 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063149, + "balance_loss_mlp": 1.0486412, + "epoch": 0.9469026548672567, + "flos": 675183403008.0, + "grad_norm": 0.06500736705872397, + "language_loss": 0.89518285, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90581435, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.14489746, + "step": 4922, + "time_per_iteration": 2.938701629638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061564, + "balance_loss_mlp": 1.04743826, + "epoch": 0.9470950365525203, + "flos": 513964859904.0, + "grad_norm": 0.07426983917980619, + "language_loss": 0.79634815, + "learning_rate": 7.32211620090012e-06, + "loss": 0.80696386, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.14135742, + "step": 4923, + "time_per_iteration": 2.650129556655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066205, + "balance_loss_mlp": 1.05188811, + "epoch": 0.9472874182377837, + "flos": 550103063040.0, + "grad_norm": 0.06509608492345216, + "language_loss": 0.81173092, + "learning_rate": 7.269090441520132e-06, + "loss": 0.822393, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.14318848, + "step": 4924, + "time_per_iteration": 2.8149211406707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066395, + "balance_loss_mlp": 1.05208969, + "epoch": 0.9474797999230473, + "flos": 542769776640.0, + "grad_norm": 0.06782513775303885, + "language_loss": 0.80087507, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81153905, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.14294434, + "step": 4925, + "time_per_iteration": 2.7232677936553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066702, + "balance_loss_mlp": 1.05213428, + "epoch": 0.9476721816083109, + "flos": 844644879360.0, + "grad_norm": 0.07485474272112004, + "language_loss": 0.85697073, + "learning_rate": 7.163612828585242e-06, + "loss": 0.86763775, + 
"num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.14562988, + "step": 4926, + "time_per_iteration": 3.1124749183654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106154, + "balance_loss_mlp": 1.0473187, + "epoch": 0.9478645632935745, + "flos": 638002676736.0, + "grad_norm": 0.07642573130807323, + "language_loss": 0.79089957, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80151492, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.14233398, + "step": 4927, + "time_per_iteration": 2.7843739986419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_mlp": 1.05232859, + "epoch": 0.948056944978838, + "flos": 656832748032.0, + "grad_norm": 0.06890136753931456, + "language_loss": 0.75879681, + "learning_rate": 7.058900559793469e-06, + "loss": 0.7694627, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.1427002, + "step": 4928, + "time_per_iteration": 2.831721544265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064108, + "balance_loss_mlp": 1.04976702, + "epoch": 0.9482493266641016, + "flos": 440907660288.0, + "grad_norm": 0.07279210234186714, + "language_loss": 0.83387977, + "learning_rate": 7.00683148031378e-06, + "loss": 0.84452081, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.14318848, + "step": 4929, + "time_per_iteration": 2.5189461708068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065665, + "balance_loss_mlp": 1.05141914, + "epoch": 0.9484417083493651, + "flos": 545989123584.0, + "grad_norm": 0.10355473964413647, + "language_loss": 0.78032148, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.79097813, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.14245605, + "step": 4930, + "time_per_iteration": 2.784816265106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064868, + "balance_loss_mlp": 1.05073011, + "epoch": 0.9486340900346287, + "flos": 
538598937600.0, + "grad_norm": 0.05876135972257936, + "language_loss": 0.79680765, + "learning_rate": 6.903267532262003e-06, + "loss": 0.80745637, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.14123535, + "step": 4931, + "time_per_iteration": 2.7050349712371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064154, + "balance_loss_mlp": 1.04932451, + "epoch": 0.9488264717198923, + "flos": 681669457920.0, + "grad_norm": 0.07900168224776632, + "language_loss": 0.8563565, + "learning_rate": 6.851772703896975e-06, + "loss": 0.86699808, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.14831543, + "step": 4932, + "time_per_iteration": 2.827993631362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064277, + "balance_loss_mlp": 1.04999626, + "epoch": 0.9490188534051558, + "flos": 462603944448.0, + "grad_norm": 0.08240763026965599, + "language_loss": 0.87754375, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.88818657, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.14257812, + "step": 4933, + "time_per_iteration": 2.552738666534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064417, + "balance_loss_mlp": 1.05015934, + "epoch": 0.9492112350904194, + "flos": 543135393792.0, + "grad_norm": 0.06884328604621799, + "language_loss": 0.8272537, + "learning_rate": 6.7493574384489e-06, + "loss": 0.8378979, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.1427002, + "step": 4934, + "time_per_iteration": 2.688225269317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105915, + "balance_loss_mlp": 1.04498768, + "epoch": 0.949403616775683, + "flos": 550322947584.0, + "grad_norm": 0.14362869726847521, + "language_loss": 0.84435534, + "learning_rate": 6.698437041126992e-06, + "loss": 0.85494685, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.14172363, + "step": 4935, + "time_per_iteration": 2.7325098514556885 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106501, + "balance_loss_mlp": 1.05078828, + "epoch": 0.9495959984609466, + "flos": 598383023616.0, + "grad_norm": 0.08023842745179113, + "language_loss": 0.82742482, + "learning_rate": 6.647708160456678e-06, + "loss": 0.83807492, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.14208984, + "step": 4936, + "time_per_iteration": 2.731147289276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063563, + "balance_loss_mlp": 1.04942477, + "epoch": 0.94978838014621, + "flos": 608409626112.0, + "grad_norm": 0.07231560541851297, + "language_loss": 0.81890976, + "learning_rate": 6.597170816132702e-06, + "loss": 0.82954538, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.14135742, + "step": 4937, + "time_per_iteration": 2.8009979724884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068576, + "balance_loss_mlp": 1.05419946, + "epoch": 0.9499807618314736, + "flos": 540832660992.0, + "grad_norm": 0.06879657649431303, + "language_loss": 0.86649179, + "learning_rate": 6.546825027775427e-06, + "loss": 0.8771776, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.14379883, + "step": 4938, + "time_per_iteration": 2.6949267387390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065018, + "balance_loss_mlp": 1.0509038, + "epoch": 0.9501731435167372, + "flos": 594600196608.0, + "grad_norm": 0.066496386986475, + "language_loss": 0.8279618, + "learning_rate": 6.496670814930717e-06, + "loss": 0.83861196, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.14123535, + "step": 4939, + "time_per_iteration": 2.7675018310546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065861, + "balance_loss_mlp": 1.05160344, + "epoch": 0.9503655252020008, + "flos": 454138928640.0, + "grad_norm": 0.07552307901260344, + "language_loss": 0.79926252, + "learning_rate": 6.446708197070161e-06, + "loss": 0.80992115, + 
"num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.14245605, + "step": 4940, + "time_per_iteration": 2.569943904876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106462, + "balance_loss_mlp": 1.05030322, + "epoch": 0.9505579068872644, + "flos": 667944092160.0, + "grad_norm": 0.07875850796972751, + "language_loss": 0.84661138, + "learning_rate": 6.396937193591079e-06, + "loss": 0.8572576, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.14294434, + "step": 4941, + "time_per_iteration": 2.777996301651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_mlp": 1.0555886, + "epoch": 0.9507502885725279, + "flos": 402207192576.0, + "grad_norm": 0.10996264625691853, + "language_loss": 0.81954122, + "learning_rate": 6.347357823816235e-06, + "loss": 0.8302412, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.14416504, + "step": 4942, + "time_per_iteration": 2.4901835918426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064524, + "balance_loss_mlp": 1.04971838, + "epoch": 0.9509426702577914, + "flos": 700358565888.0, + "grad_norm": 0.15087682626916998, + "language_loss": 0.79449248, + "learning_rate": 6.297970106994011e-06, + "loss": 0.80513769, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.14782715, + "step": 4943, + "time_per_iteration": 2.989339828491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061962, + "balance_loss_mlp": 1.04763293, + "epoch": 0.951135051943055, + "flos": 501415640064.0, + "grad_norm": 0.06965216151492816, + "language_loss": 0.82372928, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83434892, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.14318848, + "step": 4944, + "time_per_iteration": 2.577893018722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062657, + "balance_loss_mlp": 1.0486021, + "epoch": 0.9513274336283186, + "flos": 
614621094912.0, + "grad_norm": 0.08955037755265657, + "language_loss": 0.81476396, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.82539052, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.140625, + "step": 4945, + "time_per_iteration": 2.924196720123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062106, + "balance_loss_mlp": 1.04747915, + "epoch": 0.9515198153135821, + "flos": 519586057728.0, + "grad_norm": 0.10020891830615615, + "language_loss": 0.81823802, + "learning_rate": 6.150957065611363e-06, + "loss": 0.82885909, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.14599609, + "step": 4946, + "time_per_iteration": 2.582261800765991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063398, + "balance_loss_mlp": 1.04886687, + "epoch": 0.9517121969988457, + "flos": 664954168320.0, + "grad_norm": 0.06854803773952908, + "language_loss": 0.76341254, + "learning_rate": 6.102336151595667e-06, + "loss": 0.77404654, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.1451416, + "step": 4947, + "time_per_iteration": 2.958282947540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063649, + "balance_loss_mlp": 1.04943955, + "epoch": 0.9519045786841093, + "flos": 676409107968.0, + "grad_norm": 0.0768160436217087, + "language_loss": 0.7590248, + "learning_rate": 6.053906985658553e-06, + "loss": 0.76966131, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.14208984, + "step": 4948, + "time_per_iteration": 2.82889986038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065536, + "balance_loss_mlp": 1.05157638, + "epoch": 0.9520969603693729, + "flos": 652901617152.0, + "grad_norm": 0.08458305561651724, + "language_loss": 0.8030057, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81366104, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.13977051, + "step": 4949, + "time_per_iteration": 2.864802122116089 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066868, + "balance_loss_mlp": 1.05283666, + "epoch": 0.9522893420546364, + "flos": 743284200960.0, + "grad_norm": 0.06703168985120538, + "language_loss": 0.83432692, + "learning_rate": 5.957623973152748e-06, + "loss": 0.84499562, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.14050293, + "step": 4950, + "time_per_iteration": 3.04030179977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062922, + "balance_loss_mlp": 1.04827094, + "epoch": 0.9524817237398999, + "flos": 761696898048.0, + "grad_norm": 0.07636925944960744, + "language_loss": 0.80493855, + "learning_rate": 5.909770163964545e-06, + "loss": 0.81556773, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.14624023, + "step": 4951, + "time_per_iteration": 3.015068292617798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062355, + "balance_loss_mlp": 1.04806209, + "epoch": 0.9526741054251635, + "flos": 529125903360.0, + "grad_norm": 0.09924230241420891, + "language_loss": 0.82117671, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.83180022, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.14294434, + "step": 4952, + "time_per_iteration": 2.6157262325286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065916, + "balance_loss_mlp": 1.0515151, + "epoch": 0.9528664871104271, + "flos": 488441332224.0, + "grad_norm": 0.0759427350712735, + "language_loss": 0.80944276, + "learning_rate": 5.814638032609787e-06, + "loss": 0.82010198, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.1439209, + "step": 4953, + "time_per_iteration": 2.6230878829956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065708, + "balance_loss_mlp": 1.05156994, + "epoch": 0.9530588687956907, + "flos": 517745115648.0, + "grad_norm": 0.06500419189537737, + "language_loss": 0.85041642, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86107355, + 
"num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.14160156, + "step": 4954, + "time_per_iteration": 2.739537000656128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063265, + "balance_loss_mlp": 1.04874492, + "epoch": 0.9532512504809542, + "flos": 675148898304.0, + "grad_norm": 0.07799222064442642, + "language_loss": 0.8108077, + "learning_rate": 5.720273340271864e-06, + "loss": 0.82144034, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.14526367, + "step": 4955, + "time_per_iteration": 2.9021482467651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063887, + "balance_loss_mlp": 1.04912901, + "epoch": 0.9534436321662177, + "flos": 489523502592.0, + "grad_norm": 0.1050573421070645, + "language_loss": 0.84418821, + "learning_rate": 5.673378829575249e-06, + "loss": 0.85482705, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.14733887, + "step": 4956, + "time_per_iteration": 2.639496326446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064971, + "balance_loss_mlp": 1.05083311, + "epoch": 0.9536360138514813, + "flos": 496585147392.0, + "grad_norm": 0.07729665120468585, + "language_loss": 0.82151657, + "learning_rate": 5.626676233493167e-06, + "loss": 0.83216631, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.14135742, + "step": 4957, + "time_per_iteration": 2.636658191680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062276, + "balance_loss_mlp": 1.04801905, + "epoch": 0.9538283955367449, + "flos": 801462283776.0, + "grad_norm": 0.07611939127300738, + "language_loss": 0.84039545, + "learning_rate": 5.580165570157114e-06, + "loss": 0.85101831, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.14257812, + "step": 4958, + "time_per_iteration": 3.0440261363983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070141, + "balance_loss_mlp": 1.05581212, + "epoch": 0.9540207772220085, + "flos": 
556668039168.0, + "grad_norm": 0.06159911525192777, + "language_loss": 0.79893637, + "learning_rate": 5.533846857624203e-06, + "loss": 0.80963778, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.14318848, + "step": 4959, + "time_per_iteration": 2.845899820327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066785, + "balance_loss_mlp": 1.05197954, + "epoch": 0.954213158907272, + "flos": 684505935360.0, + "grad_norm": 0.06494855135175924, + "language_loss": 0.81421417, + "learning_rate": 5.487720113876882e-06, + "loss": 0.82488203, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.14782715, + "step": 4960, + "time_per_iteration": 2.9362258911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065975, + "balance_loss_mlp": 1.05114579, + "epoch": 0.9544055405925356, + "flos": 535752548352.0, + "grad_norm": 0.07321222855176411, + "language_loss": 0.82439888, + "learning_rate": 5.441785356823214e-06, + "loss": 0.83505863, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.14819336, + "step": 4961, + "time_per_iteration": 2.7255775928497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065276, + "balance_loss_mlp": 1.05088735, + "epoch": 0.9545979222777992, + "flos": 825404401152.0, + "grad_norm": 0.06731908344791544, + "language_loss": 0.80610138, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81675416, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.14379883, + "step": 4962, + "time_per_iteration": 3.1223695278167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066006, + "balance_loss_mlp": 1.05186808, + "epoch": 0.9547903039630627, + "flos": 761691755520.0, + "grad_norm": 0.09400179333507447, + "language_loss": 0.77407354, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.78473365, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.14147949, + "step": 4963, + "time_per_iteration": 3.0968358516693115 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068214, + "balance_loss_mlp": 1.05384946, + "epoch": 0.9549826856483262, + "flos": 515306562048.0, + "grad_norm": 0.0716769170625749, + "language_loss": 0.82514, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.83582222, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.14379883, + "step": 4964, + "time_per_iteration": 2.6738553047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106483, + "balance_loss_mlp": 1.0503341, + "epoch": 0.9551750673335898, + "flos": 643107382272.0, + "grad_norm": 0.07735162593831964, + "language_loss": 0.82933629, + "learning_rate": 5.259966551095341e-06, + "loss": 0.83998454, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.14477539, + "step": 4965, + "time_per_iteration": 2.8790547847747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060282, + "balance_loss_mlp": 1.04591751, + "epoch": 0.9553674490188534, + "flos": 472208030208.0, + "grad_norm": 0.07636041436284387, + "language_loss": 0.82715493, + "learning_rate": 5.214991993520546e-06, + "loss": 0.83775771, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.14367676, + "step": 4966, + "time_per_iteration": 2.5930259227752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066923, + "balance_loss_mlp": 1.05236745, + "epoch": 0.955559830704117, + "flos": 528317945856.0, + "grad_norm": 0.08146247068647224, + "language_loss": 0.81637287, + "learning_rate": 5.170209528521763e-06, + "loss": 0.8270421, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.14526367, + "step": 4967, + "time_per_iteration": 2.599799633026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062549, + "balance_loss_mlp": 1.04821956, + "epoch": 0.9557522123893806, + "flos": 548168518656.0, + "grad_norm": 0.13345718857384153, + "language_loss": 0.84123564, + "learning_rate": 5.125619173485196e-06, + "loss": 0.85186112, 
+ "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.14318848, + "step": 4968, + "time_per_iteration": 2.6432812213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062693, + "balance_loss_mlp": 1.0483048, + "epoch": 0.955944594074644, + "flos": 509465479680.0, + "grad_norm": 0.05812411628153182, + "language_loss": 0.81737351, + "learning_rate": 5.08122094572222e-06, + "loss": 0.82800043, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.14379883, + "step": 4969, + "time_per_iteration": 2.738231897354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064518, + "balance_loss_mlp": 1.05016541, + "epoch": 0.9561369757599076, + "flos": 527578997760.0, + "grad_norm": 0.11097368256932602, + "language_loss": 0.79432231, + "learning_rate": 5.037014862469824e-06, + "loss": 0.80496752, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.14355469, + "step": 4970, + "time_per_iteration": 2.7720186710357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067002, + "balance_loss_mlp": 1.05264878, + "epoch": 0.9563293574451712, + "flos": 498201062400.0, + "grad_norm": 0.17337384612850415, + "language_loss": 0.80255437, + "learning_rate": 4.993000940890391e-06, + "loss": 0.81322438, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.14367676, + "step": 4971, + "time_per_iteration": 2.656113862991333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008577, + "balance_loss_mlp": 1.00233078, + "epoch": 0.9565217391304348, + "flos": 1408875628032.0, + "grad_norm": 0.004243585536307748, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82782137, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.0625, + "step": 4972, + "time_per_iteration": 4.874886512756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060649, + "balance_loss_mlp": 1.04655886, + "epoch": 0.9567141208156984, + "flos": 
503846853120.0, + "grad_norm": 0.06088753235658507, + "language_loss": 0.78254598, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79315251, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.14086914, + "step": 4973, + "time_per_iteration": 2.7379391193389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064278, + "balance_loss_mlp": 1.04990137, + "epoch": 0.9569065025009619, + "flos": 433213526016.0, + "grad_norm": 0.07892228707042209, + "language_loss": 0.79897404, + "learning_rate": 4.86211231669359e-06, + "loss": 0.8096168, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.14367676, + "step": 4974, + "time_per_iteration": 2.4719619750976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066514, + "balance_loss_mlp": 1.05243528, + "epoch": 0.9570988841862255, + "flos": 589959853056.0, + "grad_norm": 0.07550242888066953, + "language_loss": 0.78395075, + "learning_rate": 4.818867211936806e-06, + "loss": 0.79461586, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.14086914, + "step": 4975, + "time_per_iteration": 2.7846784591674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106779, + "balance_loss_mlp": 1.05294812, + "epoch": 0.957291265871489, + "flos": 767278448640.0, + "grad_norm": 0.08411213981509691, + "language_loss": 0.78761947, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.79829735, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.14831543, + "step": 4976, + "time_per_iteration": 2.9675724506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065509, + "balance_loss_mlp": 1.05140674, + "epoch": 0.9574836475567526, + "flos": 639104670720.0, + "grad_norm": 0.08804452126227069, + "language_loss": 0.84846663, + "learning_rate": 4.732953758233849e-06, + "loss": 0.8591218, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.14111328, + "step": 4977, + "time_per_iteration": 2.875070810317993 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100856, + "balance_loss_mlp": 1.00231338, + "epoch": 0.9576760292420161, + "flos": 1575939649536.0, + "grad_norm": 0.004243788553748721, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79615819, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.0625, + "step": 4978, + "time_per_iteration": 4.965006113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061667, + "balance_loss_mlp": 1.04689729, + "epoch": 0.9578684109272797, + "flos": 496345439232.0, + "grad_norm": 0.06902169609028791, + "language_loss": 0.86979818, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.88041484, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.14746094, + "step": 4979, + "time_per_iteration": 2.626277446746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106511, + "balance_loss_mlp": 1.05060196, + "epoch": 0.9580607926125433, + "flos": 429954531840.0, + "grad_norm": 0.08534432456109216, + "language_loss": 0.85267627, + "learning_rate": 4.605525716805337e-06, + "loss": 0.86332732, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.14489746, + "step": 4980, + "time_per_iteration": 2.471359968185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063533, + "balance_loss_mlp": 1.04926348, + "epoch": 0.9582531742978069, + "flos": 1127262251520.0, + "grad_norm": 0.07445475831229749, + "language_loss": 0.80369455, + "learning_rate": 4.563434339466599e-06, + "loss": 0.81432986, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.1427002, + "step": 4981, + "time_per_iteration": 3.5420055389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063708, + "balance_loss_mlp": 1.04910517, + "epoch": 0.9584455559830705, + "flos": 524458395648.0, + "grad_norm": 0.06322023271985078, + "language_loss": 0.7885139, + "learning_rate": 4.521535307661085e-06, + "loss": 0.799151, + 
"num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.14575195, + "step": 4982, + "time_per_iteration": 2.6688125133514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067117, + "balance_loss_mlp": 1.05241847, + "epoch": 0.9586379376683339, + "flos": 634187543040.0, + "grad_norm": 0.0657595850835073, + "language_loss": 0.8091737, + "learning_rate": 4.479828637655392e-06, + "loss": 0.81984484, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.14672852, + "step": 4983, + "time_per_iteration": 2.900023937225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061997, + "balance_loss_mlp": 1.04760838, + "epoch": 0.9588303193535975, + "flos": 416061038592.0, + "grad_norm": 0.07308640529498234, + "language_loss": 0.8356294, + "learning_rate": 4.438314345641459e-06, + "loss": 0.84624934, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.14379883, + "step": 4984, + "time_per_iteration": 2.4941763877868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064414, + "balance_loss_mlp": 1.0499779, + "epoch": 0.9590227010388611, + "flos": 481683635712.0, + "grad_norm": 0.07297959005418315, + "language_loss": 0.78085732, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.79150152, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.14416504, + "step": 4985, + "time_per_iteration": 2.574579954147339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063094, + "balance_loss_mlp": 1.0491817, + "epoch": 0.9592150827241247, + "flos": 684540440064.0, + "grad_norm": 0.0801232178302707, + "language_loss": 0.80204809, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81267899, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.13916016, + "step": 4986, + "time_per_iteration": 2.9517881870269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060667, + "balance_loss_mlp": 1.0466485, + "epoch": 0.9594074644093882, + "flos": 
574490092032.0, + "grad_norm": 0.06970674893323296, + "language_loss": 0.70871252, + "learning_rate": 4.314925898349642e-06, + "loss": 0.71931922, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.14025879, + "step": 4987, + "time_per_iteration": 2.7779877185821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062505, + "balance_loss_mlp": 1.04840255, + "epoch": 0.9595998460946518, + "flos": 546871233024.0, + "grad_norm": 0.0813412690105397, + "language_loss": 0.78303689, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.79366195, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.14111328, + "step": 4988, + "time_per_iteration": 2.821676015853882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061716, + "balance_loss_mlp": 1.04733956, + "epoch": 0.9597922277799154, + "flos": 474043829760.0, + "grad_norm": 0.07674089772836457, + "language_loss": 0.78562862, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79624575, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.14367676, + "step": 4989, + "time_per_iteration": 2.5946123600006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063896, + "balance_loss_mlp": 1.04971027, + "epoch": 0.9599846094651789, + "flos": 514691324928.0, + "grad_norm": 0.07443176706054339, + "language_loss": 0.8581894, + "learning_rate": 4.193269428723889e-06, + "loss": 0.86882842, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.14196777, + "step": 4990, + "time_per_iteration": 2.659696578979492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064136, + "balance_loss_mlp": 1.04966402, + "epoch": 0.9601769911504425, + "flos": 594983066112.0, + "grad_norm": 0.08548186890717813, + "language_loss": 0.78247094, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79311228, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.14477539, + "step": 4991, + "time_per_iteration": 2.785454034805298 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106867, + "balance_loss_mlp": 1.05452025, + "epoch": 0.960369372835706, + "flos": 493012293120.0, + "grad_norm": 0.05933531431309235, + "language_loss": 0.79160237, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.80228913, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.14160156, + "step": 4992, + "time_per_iteration": 2.6056604385375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106865, + "balance_loss_mlp": 1.05430889, + "epoch": 0.9605617545209696, + "flos": 579293420544.0, + "grad_norm": 0.07427096992859433, + "language_loss": 0.82677233, + "learning_rate": 4.073345361845171e-06, + "loss": 0.83745885, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.14343262, + "step": 4993, + "time_per_iteration": 2.6981287002563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066158, + "balance_loss_mlp": 1.05191231, + "epoch": 0.9607541362062332, + "flos": 927708857856.0, + "grad_norm": 0.05635650765787246, + "language_loss": 0.86224592, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87290752, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.14245605, + "step": 4994, + "time_per_iteration": 3.2580032348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069237, + "balance_loss_mlp": 1.0547291, + "epoch": 0.9609465178914968, + "flos": 573121225728.0, + "grad_norm": 0.06976410259133954, + "language_loss": 0.75687838, + "learning_rate": 3.994358637073036e-06, + "loss": 0.76757073, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.14501953, + "step": 4995, + "time_per_iteration": 2.8269472122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064304, + "balance_loss_mlp": 1.04968929, + "epoch": 0.9611388995767602, + "flos": 530850475008.0, + "grad_norm": 0.1775846949415705, + "language_loss": 0.85502684, + "learning_rate": 3.955154116741244e-06, + "loss": 
0.86566985, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.14599609, + "step": 4996, + "time_per_iteration": 2.6431405544281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062028, + "balance_loss_mlp": 1.0472939, + "epoch": 0.9613312812620238, + "flos": 646247808000.0, + "grad_norm": 0.07750261021138917, + "language_loss": 0.81917465, + "learning_rate": 3.916142178097881e-06, + "loss": 0.82979488, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.1472168, + "step": 4997, + "time_per_iteration": 2.7661595344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106511, + "balance_loss_mlp": 1.05091262, + "epoch": 0.9615236629472874, + "flos": 496152718848.0, + "grad_norm": 0.0672683260199148, + "language_loss": 0.77680969, + "learning_rate": 3.877322836288888e-06, + "loss": 0.7874608, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.14208984, + "step": 4998, + "time_per_iteration": 2.8887362480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065659, + "balance_loss_mlp": 1.05123484, + "epoch": 0.961716044632551, + "flos": 512974093824.0, + "grad_norm": 0.0662764042679711, + "language_loss": 0.75444281, + "learning_rate": 3.838696106385153e-06, + "loss": 0.76509941, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.14428711, + "step": 4999, + "time_per_iteration": 2.6276803016662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066483, + "balance_loss_mlp": 1.05201101, + "epoch": 0.9619084263178146, + "flos": 501084527616.0, + "grad_norm": 0.07205618121733878, + "language_loss": 0.80739886, + "learning_rate": 3.800262003382904e-06, + "loss": 0.81806368, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.14453125, + "step": 5000, + "time_per_iteration": 2.603367567062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069579, + "balance_loss_mlp": 1.05550003, + "epoch": 0.9621008080030781, + 
"flos": 595635379200.0, + "grad_norm": 0.07888618565398942, + "language_loss": 0.74628067, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.75697649, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.14074707, + "step": 5001, + "time_per_iteration": 2.7618119716644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067961, + "balance_loss_mlp": 1.05290532, + "epoch": 0.9622931896883417, + "flos": 502250761728.0, + "grad_norm": 0.10101824369057563, + "language_loss": 0.82141006, + "learning_rate": 3.723971737693899e-06, + "loss": 0.8320896, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.15026855, + "step": 5002, + "time_per_iteration": 2.6244583129882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063578, + "balance_loss_mlp": 1.04942787, + "epoch": 0.9624855713736052, + "flos": 607287808512.0, + "grad_norm": 0.08187350631262881, + "language_loss": 0.80840087, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81903666, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.14160156, + "step": 5003, + "time_per_iteration": 2.8215861320495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060243, + "balance_loss_mlp": 1.04593801, + "epoch": 0.9626779530588688, + "flos": 510715777536.0, + "grad_norm": 0.10727098163709863, + "language_loss": 0.84822023, + "learning_rate": 3.648452157695936e-06, + "loss": 0.8588227, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.14306641, + "step": 5004, + "time_per_iteration": 2.6208014488220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064426, + "balance_loss_mlp": 1.05009699, + "epoch": 0.9628703347441323, + "flos": 627294025728.0, + "grad_norm": 0.06354974435142602, + "language_loss": 0.82661265, + "learning_rate": 3.610981411526937e-06, + "loss": 0.83725691, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.14331055, + "step": 5005, + "time_per_iteration": 
2.8532235622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067404, + "balance_loss_mlp": 1.05277693, + "epoch": 0.9630627164293959, + "flos": 630758223360.0, + "grad_norm": 0.08206220498729579, + "language_loss": 0.77569473, + "learning_rate": 3.573703380666149e-06, + "loss": 0.78636873, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.14611816, + "step": 5006, + "time_per_iteration": 2.7677853107452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066717, + "balance_loss_mlp": 1.05243576, + "epoch": 0.9632550981146595, + "flos": 570558961152.0, + "grad_norm": 0.062257883589972376, + "language_loss": 0.78452492, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.79519212, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.1427002, + "step": 5007, + "time_per_iteration": 2.869426965713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065378, + "balance_loss_mlp": 1.0506556, + "epoch": 0.9634474797999231, + "flos": 466117327872.0, + "grad_norm": 0.07554566875409159, + "language_loss": 0.81106812, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.82172191, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.14709473, + "step": 5008, + "time_per_iteration": 2.672776460647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062497, + "balance_loss_mlp": 1.04800081, + "epoch": 0.9636398614851867, + "flos": 526600714752.0, + "grad_norm": 0.08237727119905165, + "language_loss": 0.85430717, + "learning_rate": 3.463025724284974e-06, + "loss": 0.86493218, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.14489746, + "step": 5009, + "time_per_iteration": 2.628169536590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062003, + "balance_loss_mlp": 1.04785311, + "epoch": 0.9638322431704501, + "flos": 564831677952.0, + "grad_norm": 0.07717205590694699, + "language_loss": 0.75397646, + "learning_rate": 
3.4265186986348618e-06, + "loss": 0.76459646, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.14160156, + "step": 5010, + "time_per_iteration": 2.793161153793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064984, + "balance_loss_mlp": 1.05066681, + "epoch": 0.9640246248557137, + "flos": 477772328448.0, + "grad_norm": 0.06549027806244842, + "language_loss": 0.84361243, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.85426223, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.14294434, + "step": 5011, + "time_per_iteration": 2.5983877182006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065273, + "balance_loss_mlp": 1.05087256, + "epoch": 0.9642170065409773, + "flos": 539318062080.0, + "grad_norm": 0.0814335313038714, + "language_loss": 0.88396895, + "learning_rate": 3.354083022201859e-06, + "loss": 0.89462173, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.14379883, + "step": 5012, + "time_per_iteration": 2.624086618423462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064149, + "balance_loss_mlp": 1.0500232, + "epoch": 0.9644093882262409, + "flos": 523754325504.0, + "grad_norm": 0.06891136227810866, + "language_loss": 0.83706915, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.84771073, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.14123535, + "step": 5013, + "time_per_iteration": 2.6089134216308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064782, + "balance_loss_mlp": 1.05102515, + "epoch": 0.9646017699115044, + "flos": 574290031104.0, + "grad_norm": 0.08948663754721935, + "language_loss": 0.78484344, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.79549122, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.13781738, + "step": 5014, + "time_per_iteration": 2.7396328449249268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068726, + "balance_loss_mlp": 
1.0543493, + "epoch": 0.964794151596768, + "flos": 636799366656.0, + "grad_norm": 0.08470575991353577, + "language_loss": 0.84187967, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85256696, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.14355469, + "step": 5015, + "time_per_iteration": 2.7370247840881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066055, + "balance_loss_mlp": 1.05163109, + "epoch": 0.9649865332820315, + "flos": 617435550720.0, + "grad_norm": 0.07318591973033871, + "language_loss": 0.86297971, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87364024, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.14404297, + "step": 5016, + "time_per_iteration": 2.7685937881469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063409, + "balance_loss_mlp": 1.04893649, + "epoch": 0.9651789149672951, + "flos": 516183528960.0, + "grad_norm": 0.05982655632152984, + "language_loss": 0.81268066, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.82331479, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.14465332, + "step": 5017, + "time_per_iteration": 2.810807228088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064085, + "balance_loss_mlp": 1.05006623, + "epoch": 0.9653712966525587, + "flos": 492940712448.0, + "grad_norm": 0.12263937824557229, + "language_loss": 0.80021322, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.81085408, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.14038086, + "step": 5018, + "time_per_iteration": 2.5756077766418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063054, + "balance_loss_mlp": 1.04867768, + "epoch": 0.9655636783378222, + "flos": 536560505856.0, + "grad_norm": 0.06991309363729381, + "language_loss": 0.82447302, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83510351, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.14355469, + "step": 
5019, + "time_per_iteration": 2.784034013748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064572, + "balance_loss_mlp": 1.05060053, + "epoch": 0.9657560600230858, + "flos": 459023749632.0, + "grad_norm": 0.06912097229902868, + "language_loss": 0.82277477, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83342046, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.13977051, + "step": 5020, + "time_per_iteration": 2.6418702602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065145, + "balance_loss_mlp": 1.05097127, + "epoch": 0.9659484417083494, + "flos": 686178749952.0, + "grad_norm": 0.09913081128691836, + "language_loss": 0.83020145, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84085286, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.14160156, + "step": 5021, + "time_per_iteration": 2.809098482131958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007982, + "balance_loss_mlp": 1.00173593, + "epoch": 0.966140823393613, + "flos": 1502292178944.0, + "grad_norm": 0.003272670931099022, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81702226, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.0625, + "step": 5022, + "time_per_iteration": 4.741684198379517 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067318, + "balance_loss_mlp": 1.0534184, + "epoch": 0.9663332050788765, + "flos": 464899336704.0, + "grad_norm": 0.09278307855345254, + "language_loss": 0.81116998, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.82184321, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.13916016, + "step": 5023, + "time_per_iteration": 2.5895280838012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064931, + "balance_loss_mlp": 1.05078137, + "epoch": 0.96652558676414, + "flos": 500834907648.0, + "grad_norm": 0.08225049936543592, + "language_loss": 0.85264218, + 
"learning_rate": 2.9356692790444283e-06, + "loss": 0.8632915, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.14147949, + "step": 5024, + "time_per_iteration": 2.6449408531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106441, + "balance_loss_mlp": 1.04997373, + "epoch": 0.9667179684494036, + "flos": 424839914496.0, + "grad_norm": 0.098917313378826, + "language_loss": 0.82924014, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.83988422, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.14440918, + "step": 5025, + "time_per_iteration": 2.458395481109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066655, + "balance_loss_mlp": 1.05213523, + "epoch": 0.9669103501346672, + "flos": 516996628992.0, + "grad_norm": 0.09352035120498775, + "language_loss": 0.85764629, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.86831284, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.1451416, + "step": 5026, + "time_per_iteration": 2.673778533935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066519, + "balance_loss_mlp": 1.0520227, + "epoch": 0.9671027318199308, + "flos": 456241600512.0, + "grad_norm": 0.091795487168474, + "language_loss": 0.75505573, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.76572096, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.14489746, + "step": 5027, + "time_per_iteration": 2.6124234199523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065602, + "balance_loss_mlp": 1.05140436, + "epoch": 0.9672951135051943, + "flos": 525058951680.0, + "grad_norm": 0.07445455470407839, + "language_loss": 0.80153406, + "learning_rate": 2.802372171957057e-06, + "loss": 0.81219006, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.14196777, + "step": 5028, + "time_per_iteration": 2.6191561222076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066386, + 
"balance_loss_mlp": 1.05184281, + "epoch": 0.9674874951904578, + "flos": 573986082816.0, + "grad_norm": 0.4574262707706258, + "language_loss": 0.79723036, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.80789423, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.14526367, + "step": 5029, + "time_per_iteration": 2.830446720123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064711, + "balance_loss_mlp": 1.05021513, + "epoch": 0.9676798768757214, + "flos": 629184153600.0, + "grad_norm": 0.05422559876125122, + "language_loss": 0.79918784, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.80983496, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.14489746, + "step": 5030, + "time_per_iteration": 3.0141375064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007963, + "balance_loss_mlp": 1.00171638, + "epoch": 0.967872258560985, + "flos": 1463880605184.0, + "grad_norm": 0.0032783952713433553, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.76571321, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.0625, + "step": 5031, + "time_per_iteration": 4.6728925704956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067077, + "balance_loss_mlp": 1.05293906, + "epoch": 0.9680646402462486, + "flos": 565503814656.0, + "grad_norm": 0.08898799182976663, + "language_loss": 0.79104608, + "learning_rate": 2.672163531181049e-06, + "loss": 0.80171686, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.14160156, + "step": 5032, + "time_per_iteration": 2.70428729057312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007952, + "balance_loss_mlp": 1.00170588, + "epoch": 0.9682570219315121, + "flos": 1434463022592.0, + "grad_norm": 0.003278565206881768, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79082751, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 
0.0625, + "step": 5033, + "time_per_iteration": 4.792613983154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067363, + "balance_loss_mlp": 1.05304611, + "epoch": 0.9684494036167757, + "flos": 584610670080.0, + "grad_norm": 0.07336411283819118, + "language_loss": 0.81745082, + "learning_rate": 2.608217639166688e-06, + "loss": 0.82812446, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.14306641, + "step": 5034, + "time_per_iteration": 2.7284703254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066123, + "balance_loss_mlp": 1.051615, + "epoch": 0.9686417853020393, + "flos": 559064747520.0, + "grad_norm": 0.07168723880196738, + "language_loss": 0.84213465, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.8527959, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.1451416, + "step": 5035, + "time_per_iteration": 2.692577838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106653, + "balance_loss_mlp": 1.05180788, + "epoch": 0.9688341669873028, + "flos": 784927604736.0, + "grad_norm": 0.07458595130597709, + "language_loss": 0.83331645, + "learning_rate": 2.545044165539745e-06, + "loss": 0.8439818, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.14709473, + "step": 5036, + "time_per_iteration": 2.975346326828003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061475, + "balance_loss_mlp": 1.04697919, + "epoch": 0.9690265486725663, + "flos": 395899176960.0, + "grad_norm": 0.5711945724845235, + "language_loss": 0.79369569, + "learning_rate": 2.513747116326126e-06, + "loss": 0.80431038, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.14501953, + "step": 5037, + "time_per_iteration": 2.48323392868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067034, + "balance_loss_mlp": 1.05312204, + "epoch": 0.9692189303578299, + "flos": 476373726720.0, + "grad_norm": 0.07920913629310455, + "language_loss": 
0.77461714, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.7852875, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.13916016, + "step": 5038, + "time_per_iteration": 2.7738237380981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063867, + "balance_loss_mlp": 1.04974079, + "epoch": 0.9694113120430935, + "flos": 597575066112.0, + "grad_norm": 0.0715485714109308, + "language_loss": 0.78878641, + "learning_rate": 2.451732453851385e-06, + "loss": 0.79942507, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.14111328, + "step": 5039, + "time_per_iteration": 2.690324306488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106154, + "balance_loss_mlp": 1.04754531, + "epoch": 0.9696036937283571, + "flos": 500881895424.0, + "grad_norm": 0.07794078914679435, + "language_loss": 0.82386857, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.83448398, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.14001465, + "step": 5040, + "time_per_iteration": 2.6172046661376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062988, + "balance_loss_mlp": 1.04806352, + "epoch": 0.9697960754136207, + "flos": 432277088256.0, + "grad_norm": 0.08548810268717333, + "language_loss": 0.87234342, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.88297331, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.14904785, + "step": 5041, + "time_per_iteration": 2.5740058422088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064207, + "balance_loss_mlp": 1.04924631, + "epoch": 0.9699884570988841, + "flos": 568540353024.0, + "grad_norm": 0.06792244357748194, + "language_loss": 0.85212839, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.8627705, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.14929199, + "step": 5042, + "time_per_iteration": 2.8025379180908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068002, + 
"balance_loss_mlp": 1.05411386, + "epoch": 0.9701808387841477, + "flos": 516215835648.0, + "grad_norm": 0.0693889530606864, + "language_loss": 0.81386518, + "learning_rate": 2.33002120820458e-06, + "loss": 0.8245452, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.13903809, + "step": 5043, + "time_per_iteration": 2.693671941757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065828, + "balance_loss_mlp": 1.05174971, + "epoch": 0.9703732204694113, + "flos": 491517517824.0, + "grad_norm": 0.08538153831244098, + "language_loss": 0.76105028, + "learning_rate": 2.300076399000206e-06, + "loss": 0.77170855, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.14086914, + "step": 5044, + "time_per_iteration": 2.590811014175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061301, + "balance_loss_mlp": 1.04690051, + "epoch": 0.9705656021546749, + "flos": 626120451072.0, + "grad_norm": 0.07148835916137017, + "language_loss": 0.80247957, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.81309259, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.1439209, + "step": 5045, + "time_per_iteration": 2.781397819519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061296, + "balance_loss_mlp": 1.04675221, + "epoch": 0.9707579838399384, + "flos": 471437148672.0, + "grad_norm": 0.10538235080533889, + "language_loss": 0.83119071, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.84180367, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.1451416, + "step": 5046, + "time_per_iteration": 2.5805857181549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067595, + "balance_loss_mlp": 1.05327773, + "epoch": 0.970950365525202, + "flos": 492103019520.0, + "grad_norm": 0.08102393699609502, + "language_loss": 0.80504072, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81571662, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 
0.14318848, + "step": 5047, + "time_per_iteration": 2.6477913856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106484, + "balance_loss_mlp": 1.05048728, + "epoch": 0.9711427472104656, + "flos": 557322923520.0, + "grad_norm": 0.07039537184358946, + "language_loss": 0.80597341, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81662178, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.14343262, + "step": 5048, + "time_per_iteration": 2.7285449504852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065725, + "balance_loss_mlp": 1.05186129, + "epoch": 0.9713351288957291, + "flos": 625841095680.0, + "grad_norm": 0.06565780540592017, + "language_loss": 0.83665466, + "learning_rate": 2.153250946564489e-06, + "loss": 0.84731191, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.13867188, + "step": 5049, + "time_per_iteration": 2.9449574947357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066983, + "balance_loss_mlp": 1.05283248, + "epoch": 0.9715275105809927, + "flos": 499073260032.0, + "grad_norm": 0.07689693287405414, + "language_loss": 0.81132668, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.82199657, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.14147949, + "step": 5050, + "time_per_iteration": 2.722886085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063615, + "balance_loss_mlp": 1.0492146, + "epoch": 0.9717198922662562, + "flos": 477515367936.0, + "grad_norm": 0.08005861139557306, + "language_loss": 0.77713883, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.78777498, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.1439209, + "step": 5051, + "time_per_iteration": 2.56706166267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106432, + "balance_loss_mlp": 1.0501821, + "epoch": 0.9719122739515198, + "flos": 553446120960.0, + "grad_norm": 0.17410068160573605, + 
"language_loss": 0.78690982, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79755294, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.14147949, + "step": 5052, + "time_per_iteration": 2.708404302597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066819, + "balance_loss_mlp": 1.05263352, + "epoch": 0.9721046556367834, + "flos": 565852179456.0, + "grad_norm": 0.07134542484886951, + "language_loss": 0.79770613, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.80837435, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.1418457, + "step": 5053, + "time_per_iteration": 2.693629264831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_mlp": 1.05225515, + "epoch": 0.972297037322047, + "flos": 560315045376.0, + "grad_norm": 0.0739721255064351, + "language_loss": 0.78349614, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.79416072, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.14196777, + "step": 5054, + "time_per_iteration": 2.7789134979248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106497, + "balance_loss_mlp": 1.05055785, + "epoch": 0.9724894190073105, + "flos": 512440349184.0, + "grad_norm": 0.06850307979501671, + "language_loss": 0.79473531, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.80538499, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.14404297, + "step": 5055, + "time_per_iteration": 2.7343311309814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065145, + "balance_loss_mlp": 1.05078018, + "epoch": 0.972681800692574, + "flos": 613832961024.0, + "grad_norm": 0.08293012200245027, + "language_loss": 0.80427051, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81492198, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.14355469, + "step": 5056, + "time_per_iteration": 2.8321659564971924 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01067271, + "balance_loss_mlp": 1.05302536, + "epoch": 0.9728741823778376, + "flos": 833911635456.0, + "grad_norm": 0.06937694690471158, + "language_loss": 0.84109455, + "learning_rate": 1.92838141509849e-06, + "loss": 0.8517673, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.14233398, + "step": 5057, + "time_per_iteration": 3.066319465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064993, + "balance_loss_mlp": 1.05038941, + "epoch": 0.9730665640631012, + "flos": 571450982400.0, + "grad_norm": 0.07422141581965605, + "language_loss": 0.84001803, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.85066795, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.14611816, + "step": 5058, + "time_per_iteration": 2.7504796981811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061287, + "balance_loss_mlp": 1.04674363, + "epoch": 0.9732589457483648, + "flos": 506520345600.0, + "grad_norm": 0.07368606718448276, + "language_loss": 0.77334303, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78395593, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.1451416, + "step": 5059, + "time_per_iteration": 2.586639404296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065659, + "balance_loss_mlp": 1.05141389, + "epoch": 0.9734513274336283, + "flos": 926977623552.0, + "grad_norm": 0.06998637393077584, + "language_loss": 0.80083954, + "learning_rate": 1.84724562509897e-06, + "loss": 0.81149614, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.14245605, + "step": 5060, + "time_per_iteration": 3.150885820388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061881, + "balance_loss_mlp": 1.04774249, + "epoch": 0.9736437091188919, + "flos": 491930122752.0, + "grad_norm": 0.09572307555688801, + "language_loss": 0.78052622, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.79114503, + "num_input_tokens_seen": 
419112672, + "router_z_loss_mlp": 0.14147949, + "step": 5061, + "time_per_iteration": 2.7411611080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067565, + "balance_loss_mlp": 1.05327165, + "epoch": 0.9738360908041555, + "flos": 613321611264.0, + "grad_norm": 0.07503058154901762, + "language_loss": 0.83344877, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.84412444, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.14282227, + "step": 5062, + "time_per_iteration": 2.7299842834472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007973, + "balance_loss_mlp": 1.00167918, + "epoch": 0.974028472489419, + "flos": 1549561549824.0, + "grad_norm": 0.0032860520737355865, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.7700007, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.06298828, + "step": 5063, + "time_per_iteration": 4.974630117416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007973, + "balance_loss_mlp": 1.00167894, + "epoch": 0.9742208541746825, + "flos": 1411155965952.0, + "grad_norm": 0.0032857008369014933, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80685687, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.06298828, + "step": 5064, + "time_per_iteration": 4.946727752685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063186, + "balance_loss_mlp": 1.04902434, + "epoch": 0.9744132358599461, + "flos": 674884597248.0, + "grad_norm": 0.06279093313792176, + "language_loss": 0.76888525, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.77951717, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.14160156, + "step": 5065, + "time_per_iteration": 2.8534483909606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066411, + "balance_loss_mlp": 1.05205822, + "epoch": 0.9746056175452097, + "flos": 598407616512.0, + 
"grad_norm": 0.06682786638723953, + "language_loss": 0.77907526, + "learning_rate": 1.690196122544896e-06, + "loss": 0.78973937, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.14367676, + "step": 5066, + "time_per_iteration": 2.826382637023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066072, + "balance_loss_mlp": 1.05201697, + "epoch": 0.9747979992304733, + "flos": 732175428096.0, + "grad_norm": 0.061050992503925997, + "language_loss": 0.82334244, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83400315, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.140625, + "step": 5067, + "time_per_iteration": 3.0268359184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064955, + "balance_loss_mlp": 1.05038726, + "epoch": 0.9749903809157369, + "flos": 616499112960.0, + "grad_norm": 0.08601228130701646, + "language_loss": 0.76389635, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.77454591, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.14550781, + "step": 5068, + "time_per_iteration": 2.7173147201538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063536, + "balance_loss_mlp": 1.04950488, + "epoch": 0.9751827626010003, + "flos": 468398039040.0, + "grad_norm": 0.6661715569516079, + "language_loss": 0.83873451, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.84936988, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.14038086, + "step": 5069, + "time_per_iteration": 2.6044535636901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106529, + "balance_loss_mlp": 1.05052018, + "epoch": 0.9753751442862639, + "flos": 599215574016.0, + "grad_norm": 0.21413016416202899, + "language_loss": 0.8517248, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86237764, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.14746094, + "step": 5070, + "time_per_iteration": 2.7918972969055176 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067126, + "balance_loss_mlp": 1.05289221, + "epoch": 0.9755675259715275, + "flos": 650806285824.0, + "grad_norm": 0.07997280605305106, + "language_loss": 0.81889033, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.82956159, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.14257812, + "step": 5071, + "time_per_iteration": 2.8993237018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064614, + "balance_loss_mlp": 1.05048764, + "epoch": 0.9757599076567911, + "flos": 563658103296.0, + "grad_norm": 0.10641549726057599, + "language_loss": 0.78939104, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80003721, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.14135742, + "step": 5072, + "time_per_iteration": 2.6914937496185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065295, + "balance_loss_mlp": 1.05088246, + "epoch": 0.9759522893420547, + "flos": 504637558272.0, + "grad_norm": 0.07895573194632217, + "language_loss": 0.80287015, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81352311, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.14404297, + "step": 5073, + "time_per_iteration": 2.655817747116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064662, + "balance_loss_mlp": 1.05040479, + "epoch": 0.9761446710273182, + "flos": 583728560640.0, + "grad_norm": 0.07101453734648233, + "language_loss": 0.82085502, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.83150166, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.14245605, + "step": 5074, + "time_per_iteration": 2.691014289855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064848, + "balance_loss_mlp": 1.05055475, + "epoch": 0.9763370527125818, + "flos": 482207468544.0, + "grad_norm": 0.09505320344300444, + "language_loss": 0.81791657, + "learning_rate": 1.4676811323099947e-06, + "loss": 
0.828565, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.14282227, + "step": 5075, + "time_per_iteration": 2.6115615367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067149, + "balance_loss_mlp": 1.05314183, + "epoch": 0.9765294343978453, + "flos": 618987225600.0, + "grad_norm": 0.07513940722020379, + "language_loss": 0.78498113, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.79565263, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.14025879, + "step": 5076, + "time_per_iteration": 2.7157347202301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064035, + "balance_loss_mlp": 1.04988503, + "epoch": 0.9767218160831089, + "flos": 526573550592.0, + "grad_norm": 0.08975883867321018, + "language_loss": 0.85001129, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.86065167, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.14147949, + "step": 5077, + "time_per_iteration": 2.6061902046203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064199, + "balance_loss_mlp": 1.04989409, + "epoch": 0.9769141977683724, + "flos": 525194772480.0, + "grad_norm": 0.10763924777787955, + "language_loss": 0.8412196, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.8518616, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.14294434, + "step": 5078, + "time_per_iteration": 2.6408932209014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069482, + "balance_loss_mlp": 1.05521297, + "epoch": 0.977106579453636, + "flos": 457615236096.0, + "grad_norm": 0.08732182251457153, + "language_loss": 0.80499446, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.81568927, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.14257812, + "step": 5079, + "time_per_iteration": 2.821551561355591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066592, + "balance_loss_mlp": 1.05263269, + "epoch": 0.9772989611388996, + 
"flos": 532090861056.0, + "grad_norm": 0.06587331021593097, + "language_loss": 0.81444585, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82511181, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.13964844, + "step": 5080, + "time_per_iteration": 2.6979024410247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106446, + "balance_loss_mlp": 1.05042887, + "epoch": 0.9774913428241632, + "flos": 755349235200.0, + "grad_norm": 0.06897133924959288, + "language_loss": 0.86031032, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.87095487, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.14025879, + "step": 5081, + "time_per_iteration": 3.041377544403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007931, + "balance_loss_mlp": 1.0016849, + "epoch": 0.9776837245094268, + "flos": 1554320088576.0, + "grad_norm": 0.0032836554986033295, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.79903424, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.0625, + "step": 5082, + "time_per_iteration": 4.9710633754730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065578, + "balance_loss_mlp": 1.05097508, + "epoch": 0.9778761061946902, + "flos": 592534600704.0, + "grad_norm": 0.09877272311623977, + "language_loss": 0.83793986, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.84859562, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.14599609, + "step": 5083, + "time_per_iteration": 2.6839962005615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066407, + "balance_loss_mlp": 1.05211425, + "epoch": 0.9780684878799538, + "flos": 414951704064.0, + "grad_norm": 0.08910611593558808, + "language_loss": 0.82052761, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.83119166, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.14282227, + "step": 5084, + "time_per_iteration": 
2.4916131496429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069144, + "balance_loss_mlp": 1.05482697, + "epoch": 0.9782608695652174, + "flos": 568411872768.0, + "grad_norm": 0.0683698369039321, + "language_loss": 0.84943771, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.86012912, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.14331055, + "step": 5085, + "time_per_iteration": 2.727896213531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064723, + "balance_loss_mlp": 1.05031037, + "epoch": 0.978453251250481, + "flos": 690472926720.0, + "grad_norm": 0.08017894045097873, + "language_loss": 0.82961535, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84026265, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.14416504, + "step": 5086, + "time_per_iteration": 2.8707313537597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069677, + "balance_loss_mlp": 1.05558658, + "epoch": 0.9786456329357445, + "flos": 502505150976.0, + "grad_norm": 0.0603639816123071, + "language_loss": 0.77216703, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.7828638, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.14086914, + "step": 5087, + "time_per_iteration": 2.6551520824432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062505, + "balance_loss_mlp": 1.04825974, + "epoch": 0.9788380146210081, + "flos": 863183485440.0, + "grad_norm": 0.08415951244120122, + "language_loss": 0.80294234, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.8135674, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.14257812, + "step": 5088, + "time_per_iteration": 3.0204029083251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068699, + "balance_loss_mlp": 1.05485845, + "epoch": 0.9790303963062716, + "flos": 512717133312.0, + "grad_norm": 0.07232346891322454, + "language_loss": 0.84229541, + "learning_rate": 
1.1527101699371767e-06, + "loss": 0.85298246, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.1385498, + "step": 5089, + "time_per_iteration": 2.6450371742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068262, + "balance_loss_mlp": 1.05395687, + "epoch": 0.9792227779915352, + "flos": 494428147200.0, + "grad_norm": 0.08922991486466687, + "language_loss": 0.86236167, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87304425, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.14318848, + "step": 5090, + "time_per_iteration": 2.6085898876190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062554, + "balance_loss_mlp": 1.04827309, + "epoch": 0.9794151596767988, + "flos": 608325562368.0, + "grad_norm": 0.06779358783176108, + "language_loss": 0.81499738, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82562292, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.14282227, + "step": 5091, + "time_per_iteration": 2.785670280456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106137, + "balance_loss_mlp": 1.04681492, + "epoch": 0.9796075413620623, + "flos": 478222009344.0, + "grad_norm": 0.07345107204031283, + "language_loss": 0.86748743, + "learning_rate": 1.09015417612357e-06, + "loss": 0.87810111, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.14550781, + "step": 5092, + "time_per_iteration": 2.6004750728607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063152, + "balance_loss_mlp": 1.04863238, + "epoch": 0.9797999230473259, + "flos": 592220740608.0, + "grad_norm": 0.06917655152428695, + "language_loss": 0.84302372, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85365528, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.1451416, + "step": 5093, + "time_per_iteration": 2.8177921772003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106471, + "balance_loss_mlp": 1.0502739, 
+ "epoch": 0.9799923047325895, + "flos": 556381343232.0, + "grad_norm": 0.06567011725457712, + "language_loss": 0.81470811, + "learning_rate": 1.049418636655919e-06, + "loss": 0.82535523, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.14440918, + "step": 5094, + "time_per_iteration": 2.9339916706085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106685, + "balance_loss_mlp": 1.05187774, + "epoch": 0.9801846864178531, + "flos": 579456405504.0, + "grad_norm": 0.06290617495245203, + "language_loss": 0.84237778, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.85304636, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.14953613, + "step": 5095, + "time_per_iteration": 2.7596583366394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063683, + "balance_loss_mlp": 1.04974759, + "epoch": 0.9803770681031165, + "flos": 515101358592.0, + "grad_norm": 0.05884649286884671, + "language_loss": 0.79774284, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.80837965, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.1394043, + "step": 5096, + "time_per_iteration": 2.717756509780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_mlp": 1.05201113, + "epoch": 0.9805694497883801, + "flos": 566988678144.0, + "grad_norm": 0.06510968800704982, + "language_loss": 0.78243887, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79309964, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.14074707, + "step": 5097, + "time_per_iteration": 2.7859761714935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062526, + "balance_loss_mlp": 1.04820871, + "epoch": 0.9807618314736437, + "flos": 479351167488.0, + "grad_norm": 0.06784455696398038, + "language_loss": 0.7347126, + "learning_rate": 9.702721370922208e-07, + "loss": 0.74533784, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.14318848, + "step": 5098, + 
"time_per_iteration": 2.704630136489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061768, + "balance_loss_mlp": 1.0477488, + "epoch": 0.9809542131589073, + "flos": 545285053440.0, + "grad_norm": 0.11478990612033974, + "language_loss": 0.79899949, + "learning_rate": 9.509698444908344e-07, + "loss": 0.80961716, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.14038086, + "step": 5099, + "time_per_iteration": 2.6292612552642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065944, + "balance_loss_mlp": 1.05166292, + "epoch": 0.9811465948441709, + "flos": 520843696128.0, + "grad_norm": 0.07093256961934312, + "language_loss": 0.79454851, + "learning_rate": 9.318612999057452e-07, + "loss": 0.80520797, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.14282227, + "step": 5100, + "time_per_iteration": 2.605419874191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067535, + "balance_loss_mlp": 1.05283618, + "epoch": 0.9813389765294344, + "flos": 541282341888.0, + "grad_norm": 0.07637881185525433, + "language_loss": 0.80382729, + "learning_rate": 9.129465107554635e-07, + "loss": 0.81450266, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.14672852, + "step": 5101, + "time_per_iteration": 2.6618900299072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062636, + "balance_loss_mlp": 1.04866457, + "epoch": 0.981531358214698, + "flos": 567356866560.0, + "grad_norm": 0.07326205712119045, + "language_loss": 0.84316814, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85379446, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.13989258, + "step": 5102, + "time_per_iteration": 2.7000365257263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065302, + "balance_loss_mlp": 1.05118787, + "epoch": 0.9817237398999615, + "flos": 577272241152.0, + "grad_norm": 0.07352728739479987, + "language_loss": 0.80912358, + 
"learning_rate": 8.756982280578307e-07, + "loss": 0.81977654, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.14123535, + "step": 5103, + "time_per_iteration": 2.716947555541992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063153, + "balance_loss_mlp": 1.04894328, + "epoch": 0.9819161215852251, + "flos": 701507547648.0, + "grad_norm": 0.0812537946664224, + "language_loss": 0.8192457, + "learning_rate": 8.573647489714676e-07, + "loss": 0.82987726, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.14208984, + "step": 5104, + "time_per_iteration": 2.9482638835906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067939, + "balance_loss_mlp": 1.05362189, + "epoch": 0.9821085032704886, + "flos": 624188104704.0, + "grad_norm": 0.0735501937900119, + "language_loss": 0.84292555, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85360503, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.14306641, + "step": 5105, + "time_per_iteration": 2.8968729972839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062589, + "balance_loss_mlp": 1.04799807, + "epoch": 0.9823008849557522, + "flos": 499505688576.0, + "grad_norm": 0.07164543786345488, + "language_loss": 0.8119458, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82257169, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.14587402, + "step": 5106, + "time_per_iteration": 2.7407009601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064272, + "balance_loss_mlp": 1.04980028, + "epoch": 0.9824932666410158, + "flos": 523815994368.0, + "grad_norm": 0.08625390255537382, + "language_loss": 0.72545767, + "learning_rate": 8.035270459489929e-07, + "loss": 0.73610038, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.14453125, + "step": 5107, + "time_per_iteration": 2.7165608406066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067535, + 
"balance_loss_mlp": 1.05325365, + "epoch": 0.9826856483262794, + "flos": 502663366656.0, + "grad_norm": 0.0719645103131503, + "language_loss": 0.8213681, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83204341, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.14282227, + "step": 5108, + "time_per_iteration": 2.6449546813964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065727, + "balance_loss_mlp": 1.05152941, + "epoch": 0.982878030011543, + "flos": 562056869376.0, + "grad_norm": 0.06031610149525448, + "language_loss": 0.84049594, + "learning_rate": 7.686042586151354e-07, + "loss": 0.85115325, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.14196777, + "step": 5109, + "time_per_iteration": 2.8201980590820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106353, + "balance_loss_mlp": 1.04928493, + "epoch": 0.9830704116968064, + "flos": 537101591040.0, + "grad_norm": 0.06932231070065256, + "language_loss": 0.82637227, + "learning_rate": 7.514335898027857e-07, + "loss": 0.83700758, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.14245605, + "step": 5110, + "time_per_iteration": 2.7956700325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063874, + "balance_loss_mlp": 1.04966426, + "epoch": 0.98326279338207, + "flos": 458949597696.0, + "grad_norm": 0.06270744852863061, + "language_loss": 0.84185314, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85249186, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.14221191, + "step": 5111, + "time_per_iteration": 2.526143789291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063111, + "balance_loss_mlp": 1.04903221, + "epoch": 0.9834551750673336, + "flos": 640974974976.0, + "grad_norm": 0.06650494434915036, + "language_loss": 0.79163671, + "learning_rate": 7.17673735218416e-07, + "loss": 0.80226785, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 
0.14074707, + "step": 5112, + "time_per_iteration": 2.8292341232299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061423, + "balance_loss_mlp": 1.04775, + "epoch": 0.9836475567525972, + "flos": 1071807220224.0, + "grad_norm": 0.07946110892144641, + "language_loss": 0.79060733, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80122155, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.13696289, + "step": 5113, + "time_per_iteration": 3.4044573307037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066495, + "balance_loss_mlp": 1.05227315, + "epoch": 0.9838399384378607, + "flos": 565209778176.0, + "grad_norm": 0.09866362357616712, + "language_loss": 0.75764799, + "learning_rate": 6.846892349181566e-07, + "loss": 0.76831293, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.14221191, + "step": 5114, + "time_per_iteration": 2.724730968475342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063835, + "balance_loss_mlp": 1.04963779, + "epoch": 0.9840323201231242, + "flos": 772805670912.0, + "grad_norm": 0.0911355229399916, + "language_loss": 0.79936361, + "learning_rate": 6.684877586787819e-07, + "loss": 0.81000197, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.14208984, + "step": 5115, + "time_per_iteration": 3.0147950649261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064109, + "balance_loss_mlp": 1.04974413, + "epoch": 0.9842247018083878, + "flos": 472262358528.0, + "grad_norm": 0.10523121781623718, + "language_loss": 0.85520661, + "learning_rate": 6.524801401249225e-07, + "loss": 0.86584771, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.14367676, + "step": 5116, + "time_per_iteration": 2.5995094776153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064507, + "balance_loss_mlp": 1.05032063, + "epoch": 0.9844170834936514, + "flos": 525259012608.0, + "grad_norm": 0.07203158366187926, + 
"language_loss": 0.84932005, + "learning_rate": 6.366663854713295e-07, + "loss": 0.85996509, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.14196777, + "step": 5117, + "time_per_iteration": 2.637052297592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007922, + "balance_loss_mlp": 1.00167584, + "epoch": 0.984609465178915, + "flos": 1567247408640.0, + "grad_norm": 0.0032849089870143, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78170443, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.0625, + "step": 5118, + "time_per_iteration": 4.90720272064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068707, + "balance_loss_mlp": 1.0540328, + "epoch": 0.9848018468641785, + "flos": 519548981760.0, + "grad_norm": 0.09557736917405237, + "language_loss": 0.82077289, + "learning_rate": 6.056204923473584e-07, + "loss": 0.83145994, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.14672852, + "step": 5119, + "time_per_iteration": 2.6469926834106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065256, + "balance_loss_mlp": 1.05113006, + "epoch": 0.9849942285494421, + "flos": 493004952576.0, + "grad_norm": 0.07479661629278153, + "language_loss": 0.82782626, + "learning_rate": 5.903883659301167e-07, + "loss": 0.83847886, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.14147949, + "step": 5120, + "time_per_iteration": 2.576946973800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066832, + "balance_loss_mlp": 1.05213332, + "epoch": 0.9851866102347057, + "flos": 546001606656.0, + "grad_norm": 0.08446497011390579, + "language_loss": 0.80810666, + "learning_rate": 5.753501275193029e-07, + "loss": 0.81877494, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.14685059, + "step": 5121, + "time_per_iteration": 2.6319987773895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01063985, + "balance_loss_mlp": 1.04960883, + "epoch": 0.9853789919199692, + "flos": 476257729536.0, + "grad_norm": 0.07681446659102178, + "language_loss": 0.80095053, + "learning_rate": 5.605057829531912e-07, + "loss": 0.81159031, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.14355469, + "step": 5122, + "time_per_iteration": 2.5240464210510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061639, + "balance_loss_mlp": 1.04738188, + "epoch": 0.9855713736052328, + "flos": 1032619995648.0, + "grad_norm": 0.08827178594358556, + "language_loss": 0.76197588, + "learning_rate": 5.458553379950049e-07, + "loss": 0.77259231, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.14245605, + "step": 5123, + "time_per_iteration": 3.3713111877441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063782, + "balance_loss_mlp": 1.04914308, + "epoch": 0.9857637552904963, + "flos": 495050724864.0, + "grad_norm": 0.06078629887219036, + "language_loss": 0.82555091, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83618873, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.14611816, + "step": 5124, + "time_per_iteration": 2.6111574172973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066007, + "balance_loss_mlp": 1.05182135, + "epoch": 0.9859561369757599, + "flos": 592267728384.0, + "grad_norm": 0.083267958532, + "language_loss": 0.83494437, + "learning_rate": 5.17136169578103e-07, + "loss": 0.84560442, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.1418457, + "step": 5125, + "time_per_iteration": 2.6993632316589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065376, + "balance_loss_mlp": 1.05084407, + "epoch": 0.9861485186610235, + "flos": 486971149824.0, + "grad_norm": 0.07420331349038331, + "language_loss": 0.78526759, + "learning_rate": 5.030674572691907e-07, + "loss": 0.79592133, + "num_input_tokens_seen": 424470080, + 
"router_z_loss_mlp": 0.1451416, + "step": 5126, + "time_per_iteration": 2.6422102451324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063683, + "balance_loss_mlp": 1.04927087, + "epoch": 0.9863409003462871, + "flos": 518795352576.0, + "grad_norm": 0.058719013757474826, + "language_loss": 0.82536149, + "learning_rate": 4.891926668676994e-07, + "loss": 0.83599836, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.14404297, + "step": 5127, + "time_per_iteration": 2.7151970863342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007972, + "balance_loss_mlp": 1.00167775, + "epoch": 0.9865332820315506, + "flos": 1486026570240.0, + "grad_norm": 0.003284256404778656, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80190706, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.06298828, + "step": 5128, + "time_per_iteration": 4.89760160446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067477, + "balance_loss_mlp": 1.05338657, + "epoch": 0.9867256637168141, + "flos": 582112645632.0, + "grad_norm": 0.06977988742925464, + "language_loss": 0.78998387, + "learning_rate": 4.620248732582488e-07, + "loss": 0.80065858, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.14111328, + "step": 5129, + "time_per_iteration": 2.7023425102233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063752, + "balance_loss_mlp": 1.05001903, + "epoch": 0.9869180454020777, + "flos": 959303264256.0, + "grad_norm": 0.1397619668456288, + "language_loss": 0.86259735, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87323487, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.13757324, + "step": 5130, + "time_per_iteration": 3.240145444869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065812, + "balance_loss_mlp": 1.05161428, + "epoch": 0.9871104270873413, + "flos": 770730163200.0, + "grad_norm": 
0.07001869751455264, + "language_loss": 0.82417846, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.8348366, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.14196777, + "step": 5131, + "time_per_iteration": 3.044957399368286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063083, + "balance_loss_mlp": 1.04839683, + "epoch": 0.9873028087726049, + "flos": 446444794368.0, + "grad_norm": 0.08295760254842617, + "language_loss": 0.77687156, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.78750235, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.14672852, + "step": 5132, + "time_per_iteration": 2.5449488162994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063168, + "balance_loss_mlp": 1.04895806, + "epoch": 0.9874951904578684, + "flos": 507612427776.0, + "grad_norm": 0.06787160975467058, + "language_loss": 0.86360145, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87423307, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.14221191, + "step": 5133, + "time_per_iteration": 2.5786757469177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070832, + "balance_loss_mlp": 1.05663383, + "epoch": 0.987687572143132, + "flos": 716742743040.0, + "grad_norm": 0.07149794752795115, + "language_loss": 0.82624304, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83695138, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.14196777, + "step": 5134, + "time_per_iteration": 2.9011013507843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007975, + "balance_loss_mlp": 1.00168061, + "epoch": 0.9878799538283956, + "flos": 1538647695360.0, + "grad_norm": 0.0032849775675939607, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80825925, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.06298828, + "step": 5135, + "time_per_iteration": 4.909507989883423 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068183, + "balance_loss_mlp": 1.0540688, + "epoch": 0.9880723355136591, + "flos": 721424931840.0, + "grad_norm": 0.06089333863399881, + "language_loss": 0.81941283, + "learning_rate": 3.730469030412964e-07, + "loss": 0.83009458, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.14111328, + "step": 5136, + "time_per_iteration": 2.9317398071289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070456, + "balance_loss_mlp": 1.05611491, + "epoch": 0.9882647171989226, + "flos": 557350087680.0, + "grad_norm": 0.06444358386944021, + "language_loss": 0.84564662, + "learning_rate": 3.611116155572969e-07, + "loss": 0.85635114, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.14318848, + "step": 5137, + "time_per_iteration": 2.7681379318237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068262, + "balance_loss_mlp": 1.05404043, + "epoch": 0.9884570988841862, + "flos": 562820410368.0, + "grad_norm": 0.08053683664726487, + "language_loss": 0.80556041, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81624299, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.14221191, + "step": 5138, + "time_per_iteration": 2.717684745788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.04932356, + "epoch": 0.9886494805694498, + "flos": 431763167232.0, + "grad_norm": 0.08261079522387915, + "language_loss": 0.86220396, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87284046, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.14331055, + "step": 5139, + "time_per_iteration": 2.5395169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062583, + "balance_loss_mlp": 1.0483849, + "epoch": 0.9888418622547134, + "flos": 592082348544.0, + "grad_norm": 0.06860715832060843, + "language_loss": 0.9065218, + "learning_rate": 3.264696333806771e-07, + "loss": 0.91714764, + 
"num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.14196777, + "step": 5140, + "time_per_iteration": 2.7795023918151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064018, + "balance_loss_mlp": 1.04983258, + "epoch": 0.989034243939977, + "flos": 1134993461760.0, + "grad_norm": 0.06501878565104381, + "language_loss": 0.80251312, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.81315333, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.1418457, + "step": 5141, + "time_per_iteration": 3.5281801223754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067707, + "balance_loss_mlp": 1.05318689, + "epoch": 0.9892266256252404, + "flos": 566670048768.0, + "grad_norm": 0.0888536898085742, + "language_loss": 0.82055813, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.83123523, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.14501953, + "step": 5142, + "time_per_iteration": 2.697338104248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066844, + "balance_loss_mlp": 1.05258632, + "epoch": 0.989419007310504, + "flos": 640577051136.0, + "grad_norm": 0.07447901058049321, + "language_loss": 0.84180474, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.85247314, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.14257812, + "step": 5143, + "time_per_iteration": 2.9698703289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065942, + "balance_loss_mlp": 1.05180383, + "epoch": 0.9896113889957676, + "flos": 455478059520.0, + "grad_norm": 0.07747172431419576, + "language_loss": 0.81499732, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82565677, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.14135742, + "step": 5144, + "time_per_iteration": 2.6792209148406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064822, + "balance_loss_mlp": 1.05051708, + "epoch": 0.9898037706810312, + "flos": 
567339614208.0, + "grad_norm": 0.0871849348343538, + "language_loss": 0.80570358, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81635183, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.14306641, + "step": 5145, + "time_per_iteration": 2.656355142593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007929, + "balance_loss_mlp": 1.0016824, + "epoch": 0.9899961523662947, + "flos": 1550268191232.0, + "grad_norm": 0.0032846387566116595, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.79154348, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.0625, + "step": 5146, + "time_per_iteration": 4.951949834823608 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065686, + "balance_loss_mlp": 1.05124998, + "epoch": 0.9901885340515583, + "flos": 610709787648.0, + "grad_norm": 0.06900468674588564, + "language_loss": 0.85261124, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86326808, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.14428711, + "step": 5147, + "time_per_iteration": 2.8891849517822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061196, + "balance_loss_mlp": 1.04692626, + "epoch": 0.9903809157368219, + "flos": 517483385856.0, + "grad_norm": 0.06704626028114179, + "language_loss": 0.82689345, + "learning_rate": 2.426269020866512e-07, + "loss": 0.83750546, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.14257812, + "step": 5148, + "time_per_iteration": 2.569988965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068071, + "balance_loss_mlp": 1.05389738, + "epoch": 0.9905732974220854, + "flos": 1100426757120.0, + "grad_norm": 0.06984824296340629, + "language_loss": 0.8062039, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81688464, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.1418457, + "step": 5149, + "time_per_iteration": 3.4324331283569336 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106702, + "balance_loss_mlp": 1.05286968, + "epoch": 0.990765679107349, + "flos": 858002056704.0, + "grad_norm": 0.08001176069613011, + "language_loss": 0.84435785, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.85502803, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.14147949, + "step": 5150, + "time_per_iteration": 3.1345553398132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063045, + "balance_loss_mlp": 1.04887056, + "epoch": 0.9909580607926125, + "flos": 491287721472.0, + "grad_norm": 0.07724815370290013, + "language_loss": 0.80111492, + "learning_rate": 2.143871490925542e-07, + "loss": 0.81174541, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.14172363, + "step": 5151, + "time_per_iteration": 2.584542751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064273, + "balance_loss_mlp": 1.04981256, + "epoch": 0.9911504424778761, + "flos": 585060350976.0, + "grad_norm": 0.06559857995855996, + "language_loss": 0.79478276, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80542547, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.14440918, + "step": 5152, + "time_per_iteration": 2.7145586013793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062084, + "balance_loss_mlp": 1.04769564, + "epoch": 0.9913428241631397, + "flos": 570030359040.0, + "grad_norm": 0.07497960334620508, + "language_loss": 0.81697887, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.8275997, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.14379883, + "step": 5153, + "time_per_iteration": 2.6985859870910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066223, + "balance_loss_mlp": 1.05188227, + "epoch": 0.9915352058484033, + "flos": 489745958400.0, + "grad_norm": 0.0690511487953324, + "language_loss": 0.85916805, + "learning_rate": 1.8789370149374652e-07, + "loss": 
0.86983025, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.14343262, + "step": 5154, + "time_per_iteration": 2.670865058898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063538, + "balance_loss_mlp": 1.04945898, + "epoch": 0.9917275875336667, + "flos": 744047741952.0, + "grad_norm": 0.06727486345709939, + "language_loss": 0.82774746, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.83838284, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.14086914, + "step": 5155, + "time_per_iteration": 3.022193670272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066316, + "balance_loss_mlp": 1.05217803, + "epoch": 0.9919199692189303, + "flos": 508272081408.0, + "grad_norm": 0.09168443128233669, + "language_loss": 0.80004323, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.81070638, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.14135742, + "step": 5156, + "time_per_iteration": 2.74088716506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065009, + "balance_loss_mlp": 1.0509547, + "epoch": 0.9921123509041939, + "flos": 543963174912.0, + "grad_norm": 0.06903037281830961, + "language_loss": 0.83948219, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.85013229, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.14074707, + "step": 5157, + "time_per_iteration": 2.6691086292266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063591, + "balance_loss_mlp": 1.04928589, + "epoch": 0.9923047325894575, + "flos": 671561362944.0, + "grad_norm": 0.07427130857167381, + "language_loss": 0.7724582, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78309411, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.14294434, + "step": 5158, + "time_per_iteration": 2.823317289352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064823, + "balance_loss_mlp": 1.0502553, + "epoch": 0.9924971142747211, + 
"flos": 466557096960.0, + "grad_norm": 0.06658108495263643, + "language_loss": 0.80721772, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.81786597, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.14562988, + "step": 5159, + "time_per_iteration": 2.6917340755462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_mlp": 1.05381215, + "epoch": 0.9926894959599846, + "flos": 491581757952.0, + "grad_norm": 0.06964329318047109, + "language_loss": 0.82796186, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.83864188, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.1418457, + "step": 5160, + "time_per_iteration": 2.599081039428711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072872, + "balance_loss_mlp": 1.05843568, + "epoch": 0.9928818776452482, + "flos": 492389715456.0, + "grad_norm": 0.06970897228596049, + "language_loss": 0.81670171, + "learning_rate": 1.328673533166902e-07, + "loss": 0.82743043, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.14416504, + "step": 5161, + "time_per_iteration": 2.6220340728759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066514, + "balance_loss_mlp": 1.05213773, + "epoch": 0.9930742593305117, + "flos": 546357312000.0, + "grad_norm": 0.07366206225814581, + "language_loss": 0.84272861, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85339379, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.14355469, + "step": 5162, + "time_per_iteration": 2.735678195953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065503, + "balance_loss_mlp": 1.05122125, + "epoch": 0.9932666410157753, + "flos": 585510031872.0, + "grad_norm": 0.06375322147647451, + "language_loss": 0.85993826, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.87059331, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.1427002, + "step": 5163, + "time_per_iteration": 
2.778244972229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066413, + "balance_loss_mlp": 1.05194068, + "epoch": 0.9934590227010388, + "flos": 537086909952.0, + "grad_norm": 0.06627215251949191, + "language_loss": 0.83730602, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.84797013, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.14465332, + "step": 5164, + "time_per_iteration": 2.6427829265594482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064806, + "balance_loss_mlp": 1.05076265, + "epoch": 0.9936514043863024, + "flos": 518014559232.0, + "grad_norm": 0.07333210721360668, + "language_loss": 0.86763346, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.87828159, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.14038086, + "step": 5165, + "time_per_iteration": 2.648057699203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065731, + "balance_loss_mlp": 1.05143774, + "epoch": 0.993843786071566, + "flos": 744625903104.0, + "grad_norm": 0.09605012053498939, + "language_loss": 0.80456662, + "learning_rate": 9.938472493803419e-08, + "loss": 0.81522393, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.14294434, + "step": 5166, + "time_per_iteration": 3.0385072231292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060117, + "balance_loss_mlp": 1.04587162, + "epoch": 0.9940361677568296, + "flos": 525918666240.0, + "grad_norm": 0.08892246655608081, + "language_loss": 0.82095218, + "learning_rate": 9.327042513251893e-08, + "loss": 0.83155328, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.14245605, + "step": 5167, + "time_per_iteration": 2.6904261112213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062083, + "balance_loss_mlp": 1.04747951, + "epoch": 0.9942285494420932, + "flos": 555650108928.0, + "grad_norm": 0.08487084317420483, + "language_loss": 0.79729229, + "learning_rate": 
8.735020633177104e-08, + "loss": 0.80791312, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.14599609, + "step": 5168, + "time_per_iteration": 2.8531885147094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061677, + "balance_loss_mlp": 1.0473485, + "epoch": 0.9944209311273566, + "flos": 585996788736.0, + "grad_norm": 0.06702061083159072, + "language_loss": 0.82122445, + "learning_rate": 8.162407083411872e-08, + "loss": 0.83184129, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.14318848, + "step": 5169, + "time_per_iteration": 2.822988271713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106293, + "balance_loss_mlp": 1.04876781, + "epoch": 0.9946133128126202, + "flos": 735518486016.0, + "grad_norm": 0.06861911155023592, + "language_loss": 0.82474887, + "learning_rate": 7.609202086272804e-08, + "loss": 0.83537817, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.1418457, + "step": 5170, + "time_per_iteration": 3.060026168823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069695, + "balance_loss_mlp": 1.0555805, + "epoch": 0.9948056944978838, + "flos": 646018011648.0, + "grad_norm": 0.0773612646357127, + "language_loss": 0.82002652, + "learning_rate": 7.075405856526995e-08, + "loss": 0.83072352, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.14111328, + "step": 5171, + "time_per_iteration": 2.75346040725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064209, + "balance_loss_mlp": 1.05016601, + "epoch": 0.9949980761831474, + "flos": 445846809600.0, + "grad_norm": 0.07220922627510916, + "language_loss": 0.86264348, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87328553, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.14050293, + "step": 5172, + "time_per_iteration": 2.525021553039551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064486, + "balance_loss_mlp": 1.05007386, + 
"epoch": 0.995190457868411, + "flos": 435637398528.0, + "grad_norm": 0.07325225932553031, + "language_loss": 0.85702819, + "learning_rate": 6.066040520641414e-08, + "loss": 0.86767304, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.14416504, + "step": 5173, + "time_per_iteration": 2.564004421234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060993, + "balance_loss_mlp": 1.04647326, + "epoch": 0.9953828395536745, + "flos": 514187315712.0, + "grad_norm": 0.08617003715305835, + "language_loss": 0.81493837, + "learning_rate": 5.590471806377062e-08, + "loss": 0.82554829, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.14526367, + "step": 5174, + "time_per_iteration": 2.6167569160461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069772, + "balance_loss_mlp": 1.05562162, + "epoch": 0.995575221238938, + "flos": 479847836160.0, + "grad_norm": 0.07342208107709478, + "language_loss": 0.81817365, + "learning_rate": 5.134312643245709e-08, + "loss": 0.82887137, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.14135742, + "step": 5175, + "time_per_iteration": 2.56459641456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064284, + "balance_loss_mlp": 1.04964542, + "epoch": 0.9957676029242016, + "flos": 587785600512.0, + "grad_norm": 0.08029056667757119, + "language_loss": 0.76727438, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.77791721, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.14611816, + "step": 5176, + "time_per_iteration": 2.7907845973968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065785, + "balance_loss_mlp": 1.05072904, + "epoch": 0.9959599846094652, + "flos": 426465741312.0, + "grad_norm": 0.07919759530962187, + "language_loss": 0.79668772, + "learning_rate": 4.280223671243588e-08, + "loss": 0.80734563, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.15039062, + "step": 5177, + 
"time_per_iteration": 2.520141124725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060542, + "balance_loss_mlp": 1.04635572, + "epoch": 0.9961523662947287, + "flos": 611619061248.0, + "grad_norm": 0.0661716216299747, + "language_loss": 0.80615926, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.81676465, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.14196777, + "step": 5178, + "time_per_iteration": 2.9379143714904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062706, + "balance_loss_mlp": 1.04836535, + "epoch": 0.9963447479799923, + "flos": 550785111552.0, + "grad_norm": 0.07409996739278059, + "language_loss": 0.73854387, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.74917096, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.14331055, + "step": 5179, + "time_per_iteration": 2.6890337467193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_mlp": 1.05553138, + "epoch": 0.9965371296652559, + "flos": 625873402368.0, + "grad_norm": 0.08140162652865764, + "language_loss": 0.88694125, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.89763814, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.14172363, + "step": 5180, + "time_per_iteration": 2.7570343017578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069873, + "balance_loss_mlp": 1.05535316, + "epoch": 0.9967295113505195, + "flos": 639522044928.0, + "grad_norm": 0.10737901389805089, + "language_loss": 0.81821299, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.82891166, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.14526367, + "step": 5181, + "time_per_iteration": 2.881687641143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069285, + "balance_loss_mlp": 1.05515885, + "epoch": 0.996921893035783, + "flos": 607389124608.0, + "grad_norm": 0.07807155766477335, + "language_loss": 0.7710281, + 
"learning_rate": 2.484679859793282e-08, + "loss": 0.78172094, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.14123535, + "step": 5182, + "time_per_iteration": 2.8261380195617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064976, + "balance_loss_mlp": 1.05039656, + "epoch": 0.9971142747210465, + "flos": 644162388480.0, + "grad_norm": 0.07614598959451568, + "language_loss": 0.8217324, + "learning_rate": 2.183802848243488e-08, + "loss": 0.83238214, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.14550781, + "step": 5183, + "time_per_iteration": 2.8276331424713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062789, + "balance_loss_mlp": 1.0486865, + "epoch": 0.9973066564063101, + "flos": 1040773722624.0, + "grad_norm": 0.08041083784391524, + "language_loss": 0.80840302, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.81903088, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.14123535, + "step": 5184, + "time_per_iteration": 3.434018135070801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072435, + "balance_loss_mlp": 1.05810559, + "epoch": 0.9974990380915737, + "flos": 665095131648.0, + "grad_norm": 0.08089195273991168, + "language_loss": 0.83247042, + "learning_rate": 1.640281555587153e-08, + "loss": 0.84319472, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.14331055, + "step": 5185, + "time_per_iteration": 2.8602936267852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061744, + "balance_loss_mlp": 1.04755795, + "epoch": 0.9976914197768373, + "flos": 718121521152.0, + "grad_norm": 0.07909774692148493, + "language_loss": 0.77502704, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.78564447, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.1418457, + "step": 5186, + "time_per_iteration": 2.864870071411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106677, + 
"balance_loss_mlp": 1.05215502, + "epoch": 0.9978838014621008, + "flos": 518328419328.0, + "grad_norm": 0.07734126987679583, + "language_loss": 0.79241562, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.8030833, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.14599609, + "step": 5187, + "time_per_iteration": 2.6048929691314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064491, + "balance_loss_mlp": 1.05000699, + "epoch": 0.9980761831473643, + "flos": 603430829568.0, + "grad_norm": 0.07051998739206643, + "language_loss": 0.84329116, + "learning_rate": 9.70582968801148e-09, + "loss": 0.85393608, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.14465332, + "step": 5188, + "time_per_iteration": 2.8364462852478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065717, + "balance_loss_mlp": 1.05131662, + "epoch": 0.9982685648326279, + "flos": 453523691520.0, + "grad_norm": 0.07087888956754207, + "language_loss": 0.89041173, + "learning_rate": 7.861726879943021e-09, + "loss": 0.90106881, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.1439209, + "step": 5189, + "time_per_iteration": 2.5594921112060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067352, + "balance_loss_mlp": 1.05292726, + "epoch": 0.9984609465178915, + "flos": 481424103936.0, + "grad_norm": 0.09374409580915176, + "language_loss": 0.78683227, + "learning_rate": 6.211738235173403e-09, + "loss": 0.7975058, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.14416504, + "step": 5190, + "time_per_iteration": 2.660878896713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106628, + "balance_loss_mlp": 1.05229664, + "epoch": 0.9986533282031551, + "flos": 476941976064.0, + "grad_norm": 0.06560028389559337, + "language_loss": 0.84236324, + "learning_rate": 4.755864394301312e-09, + "loss": 0.85302609, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 
0.13989258, + "step": 5191, + "time_per_iteration": 2.663154363632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069579, + "balance_loss_mlp": 1.05502343, + "epoch": 0.9988457098884186, + "flos": 641948488704.0, + "grad_norm": 0.06484380916605655, + "language_loss": 0.8642782, + "learning_rate": 3.494105922541291e-09, + "loss": 0.87497401, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.14526367, + "step": 5192, + "time_per_iteration": 2.8266706466674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065596, + "balance_loss_mlp": 1.05106461, + "epoch": 0.9990380915736822, + "flos": 396321693696.0, + "grad_norm": 0.1303267741249794, + "language_loss": 0.87754923, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.88820517, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.14538574, + "step": 5193, + "time_per_iteration": 2.4542324542999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063923, + "balance_loss_mlp": 1.04993951, + "epoch": 0.9992304732589458, + "flos": 576123259392.0, + "grad_norm": 0.07524852262078693, + "language_loss": 0.84832311, + "learning_rate": 1.552936970405927e-09, + "loss": 0.8589623, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.13989258, + "step": 5194, + "time_per_iteration": 2.7570321559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069294, + "balance_loss_mlp": 1.05532289, + "epoch": 0.9994228549442093, + "flos": 544291716096.0, + "grad_norm": 0.09657930255398448, + "language_loss": 0.75726849, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76796138, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.13964844, + "step": 5195, + "time_per_iteration": 2.6761112213134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068246, + "balance_loss_mlp": 1.05446517, + "epoch": 0.9996152366294728, + "flos": 1471314502656.0, + "grad_norm": 0.07319261176496342, + 
"language_loss": 0.80473548, + "learning_rate": 3.882343933003796e-10, + "loss": 0.815418, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.13787842, + "step": 5196, + "time_per_iteration": 3.7586510181427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048015, + "balance_loss_mlp": 1.03534341, + "epoch": 0.9998076183147364, + "flos": 618950149632.0, + "grad_norm": 0.11328018299844213, + "language_loss": 0.70060062, + "learning_rate": 9.70586077619906e-11, + "loss": 0.71108079, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.12652588, + "step": 5197, + "time_per_iteration": 4.048620700836182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028912, + "balance_loss_mlp": 1.01973271, + "epoch": 1.0, + "flos": 1290737617920.0, + "grad_norm": 0.0323024732407461, + "language_loss": 0.84126532, + "learning_rate": 0.0, + "loss": 0.85155439, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.09185791, + "step": 5198, + "time_per_iteration": 5.656566858291626 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.171926856433664e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe/checkpoint-5198/training_args.bin b/sft_pretrain/Full_smoe/checkpoint-5198/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..653f1069449711a96532c47aa7e98309fc667b64 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:095a9ba23d3697135bb3cbeedb8658076e6e5b9f463636ff05e424e8a9161ab6 +size 7992 diff --git 
a/sft_pretrain/Full_smoe/checkpoint-5198/zero_to_fp32.py b/sft_pretrain/Full_smoe/checkpoint-5198/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft_pretrain/Full_smoe/checkpoint-5198/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. 
+from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def 
parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in 
state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to 
the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel 
{unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + 
print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + 
partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the 
partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + 
state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. 
you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. 
Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. 
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft_pretrain/Full_smoe/config.json b/sft_pretrain/Full_smoe/config.json new file mode 100644 index 0000000000000000000000000000000000000000..2b64363dc2caac2fadd1cd4f7d110513429135d9 --- /dev/null +++ b/sft_pretrain/Full_smoe/config.json @@ -0,0 +1,200 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bal_comp_loss_coef": 0.01, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": false, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": 
"google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe", + "norm_softmax": false, + "normalization": false, + "num_attention_heads": 32, + "num_experts": 8, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 4, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": null, + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 
1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": false, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft_pretrain/Full_smoe/generation_config.json b/sft_pretrain/Full_smoe/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft_pretrain/Full_smoe/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": 
"4.43.0" +} diff --git a/sft_pretrain/Full_smoe/model-00001-of-00002.safetensors b/sft_pretrain/Full_smoe/model-00001-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..29d76f5d80605301aab2bba59b53a5e2582094c4 --- /dev/null +++ b/sft_pretrain/Full_smoe/model-00001-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6c4f6ef38e8993629091331e0bbf23484cc88bdfd038f0dd17b6ec2800d855 +size 4972489328 diff --git a/sft_pretrain/Full_smoe/model-00002-of-00002.safetensors b/sft_pretrain/Full_smoe/model-00002-of-00002.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e981ed7608d232dd8a7891b5ff88b3683fe200cc --- /dev/null +++ b/sft_pretrain/Full_smoe/model-00002-of-00002.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a47d1e574adf804d0e976f951ed4e76e2f15cfdc5dceca7c9c377f2462d65ca9 +size 3759025152 diff --git a/sft_pretrain/Full_smoe/model.safetensors.index.json b/sft_pretrain/Full_smoe/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..2b3448fcaafe26e098595b9e2e5bd9e68d63ee24 --- /dev/null +++ b/sft_pretrain/Full_smoe/model.safetensors.index.json @@ -0,0 +1,672 @@ +{ + "metadata": { + "total_size": 8731424736 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", 
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + 
"model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors", + 
"model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors", + 
"model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors", + "model.mm_projector.layer_norm.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.layer_norm.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": 
"model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.4.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.5.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.6.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.bias": 
"model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.0.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.bias": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.experts.7.2.weight": "model-00002-of-00002.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00002-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", 
+ "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": 
"model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors", 
+ "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors", + 
"model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/sft_pretrain/Full_smoe/special_tokens_map.json b/sft_pretrain/Full_smoe/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft_pretrain/Full_smoe/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft_pretrain/Full_smoe/tokenizer.model b/sft_pretrain/Full_smoe/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft_pretrain/Full_smoe/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft_pretrain/Full_smoe/tokenizer_config.json b/sft_pretrain/Full_smoe/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft_pretrain/Full_smoe/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + 
"lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft_pretrain/Full_smoe/trainer_state.json b/sft_pretrain/Full_smoe/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..76d0d339ed60301f7a7a197772d63c53651c6e55 --- /dev/null +++ b/sft_pretrain/Full_smoe/trainer_state.json @@ -0,0 +1,78013 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5198, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03936368, + "balance_loss_mlp": 
2.84994221, + "epoch": 0.00019238168526356292, + "flos": 470575609344.0, + "grad_norm": 15.851083319408797, + "language_loss": 2.91765308, + "learning_rate": 0.0, + "loss": 1.97528625, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 10.859375, + "step": 1, + "time_per_iteration": 22.685314178466797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02018389, + "balance_loss_mlp": 1.26880157, + "epoch": 0.00038476337052712584, + "flos": 504556065792.0, + "grad_norm": 26.607348306835714, + "language_loss": 2.4131012, + "learning_rate": 0.00013726078121135892, + "loss": 2.43328524, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 7.48828125, + "step": 2, + "time_per_iteration": 2.6085429191589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02035932, + "balance_loss_mlp": 1.28710687, + "epoch": 0.0005771450557906887, + "flos": 599161245696.0, + "grad_norm": 23.480566200669728, + "language_loss": 2.12185097, + "learning_rate": 0.00021755319103969496, + "loss": 2.14221001, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 7.48046875, + "step": 3, + "time_per_iteration": 2.817356824874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02060169, + "balance_loss_mlp": 1.30028164, + "epoch": 0.0007695267410542517, + "flos": 580405326336.0, + "grad_norm": 4.363008200765646, + "language_loss": 1.37660766, + "learning_rate": 0.00027452156242271784, + "loss": 1.39720929, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 7.59375, + "step": 4, + "time_per_iteration": 2.7677674293518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02070568, + "balance_loss_mlp": 1.31411338, + "epoch": 0.0009619084263178145, + "flos": 486116204544.0, + "grad_norm": 0.9313793007477466, + "language_loss": 1.33924747, + "learning_rate": 0.0003187096642208417, + "loss": 1.35995317, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 7.55859375, + "step": 5, + "time_per_iteration": 
2.649566650390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02075998, + "balance_loss_mlp": 1.31763589, + "epoch": 0.0011542901115813775, + "flos": 560028349440.0, + "grad_norm": 2.3251391322215498, + "language_loss": 1.31535721, + "learning_rate": 0.0003548139722510539, + "loss": 1.33611727, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 7.578125, + "step": 6, + "time_per_iteration": 2.715332269668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02105134, + "balance_loss_mlp": 1.3406682, + "epoch": 0.0013466717968449403, + "flos": 533966307840.0, + "grad_norm": 0.8930621517096357, + "language_loss": 1.22756648, + "learning_rate": 0.00038533972973918044, + "loss": 1.24861789, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 7.640625, + "step": 7, + "time_per_iteration": 2.620546340942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02159823, + "balance_loss_mlp": 1.38276935, + "epoch": 0.0015390534821085034, + "flos": 492295739904.0, + "grad_norm": 0.2913046553555926, + "language_loss": 1.17756534, + "learning_rate": 0.0004117823436340768, + "loss": 1.19916344, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 7.76171875, + "step": 8, + "time_per_iteration": 2.6581108570098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02180456, + "balance_loss_mlp": 1.39310265, + "epoch": 0.0017314351673720662, + "flos": 564676033536.0, + "grad_norm": 0.2812491955312875, + "language_loss": 1.24828589, + "learning_rate": 0.00043510638207938993, + "loss": 1.27009046, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 7.859375, + "step": 9, + "time_per_iteration": 2.7921459674835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02220606, + "balance_loss_mlp": 1.43058181, + "epoch": 0.001923816852635629, + "flos": 593406798336.0, + "grad_norm": 0.32786568158439683, + "language_loss": 1.14205348, + "learning_rate": 0.00045597044543220066, + "loss": 
1.16425967, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 7.8984375, + "step": 10, + "time_per_iteration": 2.7258670330047607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0223461, + "balance_loss_mlp": 1.43886435, + "epoch": 0.002116198537899192, + "flos": 609625046016.0, + "grad_norm": 0.1860296084342833, + "language_loss": 1.11914992, + "learning_rate": 0.00047484428652143135, + "loss": 1.14149594, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 7.953125, + "step": 11, + "time_per_iteration": 2.907498359680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02235376, + "balance_loss_mlp": 1.4423002, + "epoch": 0.002308580223162755, + "flos": 545129409024.0, + "grad_norm": 0.11947281146450546, + "language_loss": 1.17959428, + "learning_rate": 0.0004920747534624128, + "loss": 1.20194793, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 7.92578125, + "step": 12, + "time_per_iteration": 2.6528539657592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02218147, + "balance_loss_mlp": 1.42507148, + "epoch": 0.002500961908426318, + "flos": 644750461440.0, + "grad_norm": 0.12512408660007263, + "language_loss": 1.20210767, + "learning_rate": 0.0005079252465375872, + "loss": 1.22428906, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 7.921875, + "step": 13, + "time_per_iteration": 2.8123886585235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02214103, + "balance_loss_mlp": 1.42140937, + "epoch": 0.0026933435936898806, + "flos": 487853259264.0, + "grad_norm": 0.16684445783545154, + "language_loss": 1.10100055, + "learning_rate": 0.0005226005109505393, + "loss": 1.12314165, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 7.9140625, + "step": 14, + "time_per_iteration": 2.628995180130005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130591, + "balance_loss_mlp": 1.36459994, + "epoch": 0.0028857252789534437, + "flos": 
434599644672.0, + "grad_norm": 0.1391159076902598, + "language_loss": 1.15644169, + "learning_rate": 0.0005362628552605367, + "loss": 1.17774749, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 7.65234375, + "step": 15, + "time_per_iteration": 2.650690793991089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02123252, + "balance_loss_mlp": 1.36260176, + "epoch": 0.0030781069642170067, + "flos": 596739944448.0, + "grad_norm": 0.12794674976623602, + "language_loss": 1.19969535, + "learning_rate": 0.0005490431248454357, + "loss": 1.22092795, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 7.6015625, + "step": 16, + "time_per_iteration": 2.7189841270446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0240823, + "balance_loss_mlp": 1.66054928, + "epoch": 0.0032704886494805694, + "flos": 1538188102656.0, + "grad_norm": 0.2699272965631097, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.78113341, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 7.46875, + "step": 17, + "time_per_iteration": 5.958680868148804 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02007176, + "balance_loss_mlp": 1.28352785, + "epoch": 0.0034628703347441324, + "flos": 473969677824.0, + "grad_norm": 0.08195889268296155, + "language_loss": 1.0631001, + "learning_rate": 0.0005723671632907488, + "loss": 1.08317184, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 7.23046875, + "step": 18, + "time_per_iteration": 2.633267879486084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01953804, + "balance_loss_mlp": 1.2572403, + "epoch": 0.0036552520200076955, + "flos": 448537554432.0, + "grad_norm": 0.11150538237586374, + "language_loss": 1.11837816, + "learning_rate": 0.0005830738490244919, + "loss": 1.13791621, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 6.97265625, + "step": 19, + "time_per_iteration": 2.526186466217041 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01920231, + "balance_loss_mlp": 1.24464774, + "epoch": 0.003847633705271258, + "flos": 636174217728.0, + "grad_norm": 0.09041890124841255, + "language_loss": 1.13942695, + "learning_rate": 0.0005932312266435596, + "loss": 1.15862942, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 6.76171875, + "step": 20, + "time_per_iteration": 2.8158531188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01861181, + "balance_loss_mlp": 1.21687818, + "epoch": 0.004040015390534821, + "flos": 589495491072.0, + "grad_norm": 0.1379829587383013, + "language_loss": 1.09075773, + "learning_rate": 0.0006028929207788754, + "loss": 1.10936952, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 6.44140625, + "step": 21, + "time_per_iteration": 2.7115283012390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01816904, + "balance_loss_mlp": 1.19815993, + "epoch": 0.004232397075798384, + "flos": 756574940160.0, + "grad_norm": 0.09955042249077097, + "language_loss": 1.11992621, + "learning_rate": 0.0006121050677327902, + "loss": 1.13809526, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 6.1796875, + "step": 22, + "time_per_iteration": 2.9170944690704346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01769897, + "balance_loss_mlp": 1.18281531, + "epoch": 0.004424778761061947, + "flos": 526692119040.0, + "grad_norm": 0.08735903991507939, + "language_loss": 1.03007698, + "learning_rate": 0.0006209076479463684, + "loss": 1.04777598, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 5.8671875, + "step": 23, + "time_per_iteration": 2.6403517723083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01733821, + "balance_loss_mlp": 1.17191648, + "epoch": 0.00461716044632551, + "flos": 548168518656.0, + "grad_norm": 0.08709154861799764, + "language_loss": 1.12691391, + "learning_rate": 0.0006293355346737718, + "loss": 1.14425218, + 
"num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 5.62890625, + "step": 24, + "time_per_iteration": 2.706193208694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01681551, + "balance_loss_mlp": 1.14711165, + "epoch": 0.004809542131589073, + "flos": 567584091648.0, + "grad_norm": 0.08429969570703955, + "language_loss": 1.08894634, + "learning_rate": 0.0006374193284416834, + "loss": 1.10576177, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 5.34765625, + "step": 25, + "time_per_iteration": 2.788973808288574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01660379, + "balance_loss_mlp": 1.15416873, + "epoch": 0.005001923816852636, + "flos": 470629191168.0, + "grad_norm": 0.1402992304177309, + "language_loss": 1.07612705, + "learning_rate": 0.0006451860277489461, + "loss": 1.09273076, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 5.05859375, + "step": 26, + "time_per_iteration": 2.6577279567718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01646, + "balance_loss_mlp": 1.17107058, + "epoch": 0.005194305502116198, + "flos": 415502701056.0, + "grad_norm": 0.16239245775773925, + "language_loss": 1.14940214, + "learning_rate": 0.0006526595731190848, + "loss": 1.16586208, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 4.73828125, + "step": 27, + "time_per_iteration": 2.4788224697113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01586113, + "balance_loss_mlp": 1.1497122, + "epoch": 0.005386687187379761, + "flos": 628771548672.0, + "grad_norm": 0.0939755899328463, + "language_loss": 1.08969474, + "learning_rate": 0.0006598612921618983, + "loss": 1.10555601, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 4.375, + "step": 28, + "time_per_iteration": 2.8451075553894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01530584, + "balance_loss_mlp": 1.12393713, + "epoch": 0.005579068872643324, + "flos": 886483201536.0, + 
"grad_norm": 0.08153278055262643, + "language_loss": 1.02661419, + "learning_rate": 0.0006668102665011454, + "loss": 1.04191995, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 4.0703125, + "step": 29, + "time_per_iteration": 3.3112235069274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0149795, + "balance_loss_mlp": 1.11743355, + "epoch": 0.005771450557906887, + "flos": 547560622080.0, + "grad_norm": 0.14907920412525114, + "language_loss": 1.11315072, + "learning_rate": 0.0006735236364718957, + "loss": 1.1281302, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 3.80273438, + "step": 30, + "time_per_iteration": 2.744025945663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444605, + "balance_loss_mlp": 1.09651423, + "epoch": 0.00596383224317045, + "flos": 532026620928.0, + "grad_norm": 0.1454661106932218, + "language_loss": 1.10029531, + "learning_rate": 0.0006800168558381346, + "loss": 1.11474133, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 3.484375, + "step": 31, + "time_per_iteration": 2.6526310443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01408917, + "balance_loss_mlp": 1.08962691, + "epoch": 0.0061562139284340135, + "flos": 589082886144.0, + "grad_norm": 0.13886710462644744, + "language_loss": 1.12821865, + "learning_rate": 0.0006863039060567947, + "loss": 1.14230776, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 3.19140625, + "step": 32, + "time_per_iteration": 2.778316020965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01386345, + "balance_loss_mlp": 1.0916599, + "epoch": 0.006348595613697576, + "flos": 618231025152.0, + "grad_norm": 0.0950000822083296, + "language_loss": 1.06182003, + "learning_rate": 0.0006923974775611263, + "loss": 1.07568347, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 2.94726562, + "step": 33, + "time_per_iteration": 2.822932243347168 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01377092, + "balance_loss_mlp": 1.10586727, + "epoch": 0.006540977298961139, + "flos": 777910376448.0, + "grad_norm": 0.0933492164101247, + "language_loss": 1.02986193, + "learning_rate": 0.0006983091239737814, + "loss": 1.04363275, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 2.71484375, + "step": 34, + "time_per_iteration": 3.030482530593872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01362684, + "balance_loss_mlp": 1.11224914, + "epoch": 0.006733358984224702, + "flos": 667143475200.0, + "grad_norm": 0.11255876729792032, + "language_loss": 1.0177412, + "learning_rate": 0.0007040493939600222, + "loss": 1.03136802, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 2.50195312, + "step": 35, + "time_per_iteration": 2.849836826324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339928, + "balance_loss_mlp": 1.10723162, + "epoch": 0.006925740669488265, + "flos": 564372085248.0, + "grad_norm": 0.05318315286224845, + "language_loss": 1.02413034, + "learning_rate": 0.0007096279445021078, + "loss": 1.03752947, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 2.32421875, + "step": 36, + "time_per_iteration": 2.7724404335021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333933, + "balance_loss_mlp": 1.12202668, + "epoch": 0.007118122354751828, + "flos": 549887947776.0, + "grad_norm": 0.09673231095327042, + "language_loss": 1.09330344, + "learning_rate": 0.0007150536386503726, + "loss": 1.10664272, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 2.12304688, + "step": 37, + "time_per_iteration": 2.87898588180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131197, + "balance_loss_mlp": 1.11932778, + "epoch": 0.007310504040015391, + "flos": 702490973184.0, + "grad_norm": 0.1501774474729275, + "language_loss": 1.02011764, + "learning_rate": 0.0007203346302358509, + "loss": 1.03323734, + "num_input_tokens_seen": 2883184, + 
"router_z_loss_mlp": 1.92578125, + "step": 38, + "time_per_iteration": 2.9664244651794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01301697, + "balance_loss_mlp": 1.11916423, + "epoch": 0.007502885725278953, + "flos": 599316890112.0, + "grad_norm": 0.13354549864664766, + "language_loss": 1.06722176, + "learning_rate": 0.000725478437577282, + "loss": 1.08023882, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 1.82324219, + "step": 39, + "time_per_iteration": 2.8403327465057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269709, + "balance_loss_mlp": 1.10262501, + "epoch": 0.007695267410542516, + "flos": 560285309952.0, + "grad_norm": 0.06892008670865749, + "language_loss": 1.01746094, + "learning_rate": 0.0007304920078549186, + "loss": 1.03015804, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 1.671875, + "step": 40, + "time_per_iteration": 2.7219579219818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01271556, + "balance_loss_mlp": 1.1131506, + "epoch": 0.007887649095806078, + "flos": 508170765312.0, + "grad_norm": 0.1603294487215327, + "language_loss": 1.03720689, + "learning_rate": 0.0007353817735343603, + "loss": 1.04992247, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 1.58300781, + "step": 41, + "time_per_iteration": 2.7060108184814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246674, + "balance_loss_mlp": 1.10390913, + "epoch": 0.008080030781069641, + "flos": 503893840896.0, + "grad_norm": 0.0511848053751201, + "language_loss": 0.99442279, + "learning_rate": 0.0007401537019902344, + "loss": 1.00688958, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 1.42871094, + "step": 42, + "time_per_iteration": 2.633784294128418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227359, + "balance_loss_mlp": 1.0990901, + "epoch": 0.008272412466333205, + "flos": 518031811584.0, + "grad_norm": 0.10374794700504324, + 
"language_loss": 1.02897811, + "learning_rate": 0.0007448133392900729, + "loss": 1.04125178, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 1.28222656, + "step": 43, + "time_per_iteration": 2.7279117107391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0123262, + "balance_loss_mlp": 1.11207604, + "epoch": 0.008464794151596768, + "flos": 607974626304.0, + "grad_norm": 0.09096864884609944, + "language_loss": 0.98755985, + "learning_rate": 0.0007493658489441491, + "loss": 0.99988604, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 1.20410156, + "step": 44, + "time_per_iteration": 2.8941659927368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217323, + "balance_loss_mlp": 1.10812736, + "epoch": 0.00865717583686033, + "flos": 537929372160.0, + "grad_norm": 0.11598802445363406, + "language_loss": 1.0210619, + "learning_rate": 0.0007538160463002316, + "loss": 1.03323507, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 1.09375, + "step": 45, + "time_per_iteration": 2.7019526958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216008, + "balance_loss_mlp": 1.11510944, + "epoch": 0.008849557522123894, + "flos": 508007780352.0, + "grad_norm": 0.06911757836433406, + "language_loss": 1.05356646, + "learning_rate": 0.0007581684291577274, + "loss": 1.06572652, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 1.00927734, + "step": 46, + "time_per_iteration": 2.5990471839904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209164, + "balance_loss_mlp": 1.11603808, + "epoch": 0.009041939207387457, + "flos": 625339657728.0, + "grad_norm": 0.09057982339816145, + "language_loss": 1.08819616, + "learning_rate": 0.0007624272050891776, + "loss": 1.10028791, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.93066406, + "step": 47, + "time_per_iteration": 2.8298892974853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175031, + 
"balance_loss_mlp": 1.09315765, + "epoch": 0.00923432089265102, + "flos": 549421014528.0, + "grad_norm": 0.06662076278867826, + "language_loss": 0.98563552, + "learning_rate": 0.0007665963158851307, + "loss": 0.99738586, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.81884766, + "step": 48, + "time_per_iteration": 2.840701103210449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175621, + "balance_loss_mlp": 1.10109115, + "epoch": 0.009426702577914583, + "flos": 562496638464.0, + "grad_norm": 0.07605871591802618, + "language_loss": 1.06984305, + "learning_rate": 0.0007706794594783609, + "loss": 1.08159924, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.74511719, + "step": 49, + "time_per_iteration": 2.7622482776641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171173, + "balance_loss_mlp": 1.10093522, + "epoch": 0.009619084263178146, + "flos": 616773325824.0, + "grad_norm": 0.048657988043197084, + "language_loss": 1.05961394, + "learning_rate": 0.0007746801096530423, + "loss": 1.07132566, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.703125, + "step": 50, + "time_per_iteration": 2.768888473510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173086, + "balance_loss_mlp": 1.10890365, + "epoch": 0.009811465948441709, + "flos": 541437986304.0, + "grad_norm": 0.10082721582881933, + "language_loss": 1.10655856, + "learning_rate": 0.0007786015338021173, + "loss": 1.11828947, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.64160156, + "step": 51, + "time_per_iteration": 2.6473164558410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155225, + "balance_loss_mlp": 1.09590614, + "epoch": 0.010003847633705272, + "flos": 535881028608.0, + "grad_norm": 0.0966315307988203, + "language_loss": 1.03207719, + "learning_rate": 0.0007824468089603051, + "loss": 1.04362941, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.59277344, + 
"step": 52, + "time_per_iteration": 2.6773018836975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011599, + "balance_loss_mlp": 1.10766244, + "epoch": 0.010196229318968833, + "flos": 909254315520.0, + "grad_norm": 0.060495503821074374, + "language_loss": 1.02858949, + "learning_rate": 0.0007862188363098669, + "loss": 1.04018843, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.52319336, + "step": 53, + "time_per_iteration": 3.174023389816284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150276, + "balance_loss_mlp": 1.10125709, + "epoch": 0.010388611004232396, + "flos": 585868308480.0, + "grad_norm": 0.08315393852704078, + "language_loss": 1.03287244, + "learning_rate": 0.0007899203543304438, + "loss": 1.04437518, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.48974609, + "step": 54, + "time_per_iteration": 2.7804617881774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158937, + "balance_loss_mlp": 1.11192107, + "epoch": 0.01058099268949596, + "flos": 502480558080.0, + "grad_norm": 0.13140028768126893, + "language_loss": 1.16694331, + "learning_rate": 0.0007935539507422731, + "loss": 1.1785326, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.47021484, + "step": 55, + "time_per_iteration": 2.6466386318206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137235, + "balance_loss_mlp": 1.09496331, + "epoch": 0.010773374374759523, + "flos": 544447360512.0, + "grad_norm": 0.06179334078386534, + "language_loss": 1.08511901, + "learning_rate": 0.0007971220733732573, + "loss": 1.09649134, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.42285156, + "step": 56, + "time_per_iteration": 2.7039074897766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138761, + "balance_loss_mlp": 1.10166252, + "epoch": 0.010965756060023086, + "flos": 526155803136.0, + "grad_norm": 0.08220293288244152, + "language_loss": 1.03500617, + 
"learning_rate": 0.0008006270400641869, + "loss": 1.04639375, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.37084961, + "step": 57, + "time_per_iteration": 2.7175657749176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113224, + "balance_loss_mlp": 1.0981698, + "epoch": 0.011158137745286649, + "flos": 576941128704.0, + "grad_norm": 0.07093854356308794, + "language_loss": 1.04580712, + "learning_rate": 0.0008040710477125043, + "loss": 1.0571295, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.34106445, + "step": 58, + "time_per_iteration": 2.7424120903015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135189, + "balance_loss_mlp": 1.10312176, + "epoch": 0.011350519430550212, + "flos": 529281547776.0, + "grad_norm": 0.07916475402820797, + "language_loss": 1.05395138, + "learning_rate": 0.0008074561805429771, + "loss": 1.06530333, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.32055664, + "step": 59, + "time_per_iteration": 2.7407617568969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130551, + "balance_loss_mlp": 1.10155916, + "epoch": 0.011542901115813775, + "flos": 555879905280.0, + "grad_norm": 0.04727062297709066, + "language_loss": 1.03273892, + "learning_rate": 0.0008107844176832545, + "loss": 1.04404449, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.28979492, + "step": 60, + "time_per_iteration": 2.6854803562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141191, + "balance_loss_mlp": 1.11353481, + "epoch": 0.011735282801077338, + "flos": 572095954944.0, + "grad_norm": 0.0952762711916136, + "language_loss": 1.04648042, + "learning_rate": 0.0008140576401132568, + "loss": 1.05789232, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.27685547, + "step": 61, + "time_per_iteration": 2.6589457988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137564, + "balance_loss_mlp": 
1.11303091, + "epoch": 0.0119276644863409, + "flos": 615589839360.0, + "grad_norm": 0.07958403959823916, + "language_loss": 1.06014252, + "learning_rate": 0.0008172776370494935, + "loss": 1.07151818, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.24536133, + "step": 62, + "time_per_iteration": 2.768505334854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112985, + "balance_loss_mlp": 1.10548401, + "epoch": 0.012120046171604464, + "flos": 501084527616.0, + "grad_norm": 0.09183855716602674, + "language_loss": 1.12897038, + "learning_rate": 0.0008204461118185703, + "loss": 1.14026892, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.24353027, + "step": 63, + "time_per_iteration": 2.5573627948760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130628, + "balance_loss_mlp": 1.10793018, + "epoch": 0.012312427856868027, + "flos": 473347100160.0, + "grad_norm": 0.09747713298414284, + "language_loss": 1.02471447, + "learning_rate": 0.0008235646872681536, + "loss": 1.03602076, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.22692871, + "step": 64, + "time_per_iteration": 2.585127353668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127769, + "balance_loss_mlp": 1.10554826, + "epoch": 0.012504809542131588, + "flos": 538369141248.0, + "grad_norm": 0.10571288349070412, + "language_loss": 1.02039421, + "learning_rate": 0.0008266349107584288, + "loss": 1.03167176, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.2220459, + "step": 65, + "time_per_iteration": 2.703620433807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140513, + "balance_loss_mlp": 1.11872149, + "epoch": 0.012697191227395151, + "flos": 608730826752.0, + "grad_norm": 0.14637461076762864, + "language_loss": 1.05036354, + "learning_rate": 0.0008296582587724851, + "loss": 1.06176865, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.21801758, + "step": 66, + 
"time_per_iteration": 2.728839159011841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121296, + "balance_loss_mlp": 1.09962404, + "epoch": 0.012889572912658714, + "flos": 768079065600.0, + "grad_norm": 0.10157918798152736, + "language_loss": 1.03485751, + "learning_rate": 0.0008326361411800136, + "loss": 1.04607058, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.21704102, + "step": 67, + "time_per_iteration": 2.963634729385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119673, + "balance_loss_mlp": 1.09863222, + "epoch": 0.013081954597922277, + "flos": 533887013376.0, + "grad_norm": 0.046087363126128704, + "language_loss": 1.03369427, + "learning_rate": 0.0008355699051851403, + "loss": 1.044891, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.21057129, + "step": 68, + "time_per_iteration": 2.7779767513275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146637, + "balance_loss_mlp": 1.12541735, + "epoch": 0.01327433628318584, + "flos": 573096632832.0, + "grad_norm": 0.10078437623262682, + "language_loss": 1.10584092, + "learning_rate": 0.0008384608389860635, + "loss": 1.11730719, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.21228027, + "step": 69, + "time_per_iteration": 2.72163724899292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158467, + "balance_loss_mlp": 1.13795137, + "epoch": 0.013466717968449404, + "flos": 497274536448.0, + "grad_norm": 0.07269171982270876, + "language_loss": 1.00728607, + "learning_rate": 0.000841310175171381, + "loss": 1.01887083, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.2052002, + "step": 70, + "time_per_iteration": 2.653019666671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157048, + "balance_loss_mlp": 1.13693786, + "epoch": 0.013659099653712967, + "flos": 565511155200.0, + "grad_norm": 0.09340958478170322, + "language_loss": 0.98922431, + "learning_rate": 
0.000844119093875517, + "loss": 1.00079489, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.2010498, + "step": 71, + "time_per_iteration": 2.722351551055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152491, + "balance_loss_mlp": 1.13224936, + "epoch": 0.01385148133897653, + "flos": 573820526592.0, + "grad_norm": 0.08018714642813927, + "language_loss": 1.04454517, + "learning_rate": 0.0008468887257134666, + "loss": 1.05607009, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.20239258, + "step": 72, + "time_per_iteration": 2.7619922161102295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134825, + "balance_loss_mlp": 1.11441696, + "epoch": 0.014043863024240093, + "flos": 576822560256.0, + "grad_norm": 0.07872027680195416, + "language_loss": 1.06334233, + "learning_rate": 0.0008496201545131264, + "loss": 1.07469058, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.20410156, + "step": 73, + "time_per_iteration": 2.7532896995544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135406, + "balance_loss_mlp": 1.11529493, + "epoch": 0.014236244709503656, + "flos": 938681809920.0, + "grad_norm": 0.07696506497764126, + "language_loss": 1.03964853, + "learning_rate": 0.0008523144198617317, + "loss": 1.0510025, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.20092773, + "step": 74, + "time_per_iteration": 3.220428943634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113223, + "balance_loss_mlp": 1.11140466, + "epoch": 0.014428626394767219, + "flos": 528483502080.0, + "grad_norm": 0.08624589903961616, + "language_loss": 1.03597379, + "learning_rate": 0.0008549725194813783, + "loss": 1.04729605, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.20825195, + "step": 75, + "time_per_iteration": 2.6929681301116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126542, + "balance_loss_mlp": 1.1071701, + "epoch": 
0.014621008080030782, + "flos": 803752533504.0, + "grad_norm": 0.1408086440247197, + "language_loss": 1.02827942, + "learning_rate": 0.0008575954114472099, + "loss": 1.03954494, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.19360352, + "step": 76, + "time_per_iteration": 3.1799752712249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139543, + "balance_loss_mlp": 1.12005258, + "epoch": 0.014813389765294343, + "flos": 696941356032.0, + "grad_norm": 0.07592994584910524, + "language_loss": 1.00451732, + "learning_rate": 0.0008601840162606118, + "loss": 1.01591277, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.19470215, + "step": 77, + "time_per_iteration": 3.0833282470703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138555, + "balance_loss_mlp": 1.11827779, + "epoch": 0.015005771450557906, + "flos": 596994333696.0, + "grad_norm": 0.08431129228371863, + "language_loss": 1.0643971, + "learning_rate": 0.000862739218788641, + "loss": 1.07578266, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.20275879, + "step": 78, + "time_per_iteration": 2.8568053245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141293, + "balance_loss_mlp": 1.121135, + "epoch": 0.01519815313582147, + "flos": 549416245248.0, + "grad_norm": 0.11686764405791189, + "language_loss": 1.04346561, + "learning_rate": 0.0008652618700799138, + "loss": 1.05487859, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.20153809, + "step": 79, + "time_per_iteration": 2.6828417778015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144733, + "balance_loss_mlp": 1.12453914, + "epoch": 0.015390534821085032, + "flos": 430532692992.0, + "grad_norm": 0.10817734170715895, + "language_loss": 1.03413367, + "learning_rate": 0.0008677527890662774, + "loss": 1.0455811, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.2019043, + "step": 80, + "time_per_iteration": 
2.4982268810272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142756, + "balance_loss_mlp": 1.12232339, + "epoch": 0.015582916506348595, + "flos": 524119942656.0, + "grad_norm": 0.09792899658664883, + "language_loss": 1.04667735, + "learning_rate": 0.0008702127641587799, + "loss": 1.05810475, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.20422363, + "step": 81, + "time_per_iteration": 2.7113406658172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136455, + "balance_loss_mlp": 1.11561751, + "epoch": 0.015775298191612157, + "flos": 575443782144.0, + "grad_norm": 0.10099710945407976, + "language_loss": 1.00204504, + "learning_rate": 0.0008726425547457192, + "loss": 1.01340961, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.20825195, + "step": 82, + "time_per_iteration": 2.8304948806762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140166, + "balance_loss_mlp": 1.12054384, + "epoch": 0.01596767987687572, + "flos": 610319577600.0, + "grad_norm": 0.11260184265628481, + "language_loss": 0.99513066, + "learning_rate": 0.0008750428925998964, + "loss": 1.00653231, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.19604492, + "step": 83, + "time_per_iteration": 2.762498617172241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147828, + "balance_loss_mlp": 1.12830114, + "epoch": 0.016160061562139283, + "flos": 567136982016.0, + "grad_norm": 0.1180796768296156, + "language_loss": 1.05058432, + "learning_rate": 0.0008774144832015932, + "loss": 1.06206274, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.19519043, + "step": 84, + "time_per_iteration": 2.749310255050659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01971265, + "balance_loss_mlp": 1.95724583, + "epoch": 0.016352443247402846, + "flos": 1411343543808.0, + "grad_norm": 0.4228509486674634, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, 
+ "loss": 0.7674557, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.140625, + "step": 85, + "time_per_iteration": 4.626708745956421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137886, + "balance_loss_mlp": 1.11834753, + "epoch": 0.01654482493266641, + "flos": 730497844224.0, + "grad_norm": 0.09445954258913132, + "language_loss": 1.0054847, + "learning_rate": 0.0008820741205014318, + "loss": 1.01686358, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.1953125, + "step": 86, + "time_per_iteration": 2.918696403503418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145479, + "balance_loss_mlp": 1.12540436, + "epoch": 0.016737206617929972, + "flos": 536293633536.0, + "grad_norm": 0.14940142735495454, + "language_loss": 1.02554607, + "learning_rate": 0.0008843634575408404, + "loss": 1.03700089, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.20068359, + "step": 87, + "time_per_iteration": 2.6972436904907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140944, + "balance_loss_mlp": 1.12226439, + "epoch": 0.016929588303193535, + "flos": 536990363136.0, + "grad_norm": 0.07729580722494055, + "language_loss": 1.03912258, + "learning_rate": 0.0008866266301555082, + "loss": 1.0505321, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.18676758, + "step": 88, + "time_per_iteration": 2.741374969482422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164989, + "balance_loss_mlp": 1.14647579, + "epoch": 0.017121969988457098, + "flos": 526756359168.0, + "grad_norm": 0.12135336715529384, + "language_loss": 1.04746294, + "learning_rate": 0.0008888642296509615, + "loss": 1.05911291, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.18493652, + "step": 89, + "time_per_iteration": 2.62099552154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183021, + "balance_loss_mlp": 1.16370893, + "epoch": 0.01731435167372066, + "flos": 
625596618240.0, + "grad_norm": 0.13101998707614188, + "language_loss": 1.08785903, + "learning_rate": 0.0008910768275115906, + "loss": 1.09968925, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.19311523, + "step": 90, + "time_per_iteration": 2.819420099258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181615, + "balance_loss_mlp": 1.16215992, + "epoch": 0.017506733358984224, + "flos": 496402338816.0, + "grad_norm": 0.1050230223941115, + "language_loss": 1.04935551, + "learning_rate": 0.0008932649762767675, + "loss": 1.06117165, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.19445801, + "step": 91, + "time_per_iteration": 2.622406244277954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182279, + "balance_loss_mlp": 1.16277599, + "epoch": 0.017699115044247787, + "flos": 745933100544.0, + "grad_norm": 0.08683374673810437, + "language_loss": 1.07276869, + "learning_rate": 0.0008954292103690864, + "loss": 1.08459151, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.19494629, + "step": 92, + "time_per_iteration": 2.9198801517486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185402, + "balance_loss_mlp": 1.16578054, + "epoch": 0.01789149672951135, + "flos": 515509194240.0, + "grad_norm": 0.18507766534910622, + "language_loss": 1.0957979, + "learning_rate": 0.0008975700468778296, + "loss": 1.10765195, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.19616699, + "step": 93, + "time_per_iteration": 2.6395699977874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183129, + "balance_loss_mlp": 1.1639359, + "epoch": 0.018083878414774913, + "flos": 586125268992.0, + "grad_norm": 0.14308809926092464, + "language_loss": 1.0301311, + "learning_rate": 0.0008996879863005366, + "loss": 1.04196239, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.19189453, + "step": 94, + "time_per_iteration": 2.685325860977173 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192741, + "balance_loss_mlp": 1.17335784, + "epoch": 0.018276260100038477, + "flos": 497356028928.0, + "grad_norm": 0.08942422865693514, + "language_loss": 1.02994668, + "learning_rate": 0.0009017835132453337, + "loss": 1.04187417, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.19360352, + "step": 95, + "time_per_iteration": 2.640179395675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185298, + "balance_loss_mlp": 1.1659379, + "epoch": 0.01846864178530204, + "flos": 640058360832.0, + "grad_norm": 0.12775293798220247, + "language_loss": 1.03491902, + "learning_rate": 0.0009038570970964896, + "loss": 1.046772, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.19348145, + "step": 96, + "time_per_iteration": 2.8062894344329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153754, + "balance_loss_mlp": 1.13440657, + "epoch": 0.018661023470565603, + "flos": 511662127104.0, + "grad_norm": 0.07493462569205835, + "language_loss": 1.00418913, + "learning_rate": 0.0009059091926454854, + "loss": 1.01572669, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.1932373, + "step": 97, + "time_per_iteration": 2.625839948654175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147507, + "balance_loss_mlp": 1.12845731, + "epoch": 0.018853405155829166, + "flos": 931106244096.0, + "grad_norm": 0.09820444328466757, + "language_loss": 0.99835473, + "learning_rate": 0.0009079402406897198, + "loss": 1.00982976, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.19042969, + "step": 98, + "time_per_iteration": 3.2515511512756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153758, + "balance_loss_mlp": 1.13449359, + "epoch": 0.01904578684109273, + "flos": 576484107264.0, + "grad_norm": 0.1057784840893083, + "language_loss": 1.01116824, + "learning_rate": 0.0009099506686008212, + "loss": 1.02270579, + 
"num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.19262695, + "step": 99, + "time_per_iteration": 2.8564164638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131688, + "balance_loss_mlp": 1.11337709, + "epoch": 0.019238168526356292, + "flos": 558442169856.0, + "grad_norm": 0.06422494393503501, + "language_loss": 1.04474521, + "learning_rate": 0.0009119408908644013, + "loss": 1.0560621, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.18310547, + "step": 100, + "time_per_iteration": 2.717921495437622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126129, + "balance_loss_mlp": 1.10765147, + "epoch": 0.019430550211619855, + "flos": 723851375616.0, + "grad_norm": 0.13157053780302536, + "language_loss": 1.09764636, + "learning_rate": 0.0009139113095929519, + "loss": 1.1089077, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.18469238, + "step": 101, + "time_per_iteration": 2.8778345584869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147568, + "balance_loss_mlp": 1.12801814, + "epoch": 0.019622931896883418, + "flos": 499478524416.0, + "grad_norm": 0.09138386946816152, + "language_loss": 1.03731561, + "learning_rate": 0.0009158623150134762, + "loss": 1.04879129, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.19543457, + "step": 102, + "time_per_iteration": 2.588974952697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127418, + "balance_loss_mlp": 1.10807002, + "epoch": 0.01981531358214698, + "flos": 509188695552.0, + "grad_norm": 0.09239208832300977, + "language_loss": 1.03516126, + "learning_rate": 0.000917794285931332, + "loss": 1.04643536, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.1932373, + "step": 103, + "time_per_iteration": 2.680100917816162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126804, + "balance_loss_mlp": 1.10709858, + "epoch": 0.020007695267410544, + "flos": 521347705344.0, + 
"grad_norm": 0.06521053042835766, + "language_loss": 0.95701432, + "learning_rate": 0.0009197075901716639, + "loss": 0.96828234, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.19689941, + "step": 104, + "time_per_iteration": 2.730409860610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154929, + "balance_loss_mlp": 1.13441312, + "epoch": 0.020200076952674107, + "flos": 533298940416.0, + "grad_norm": 0.1079045695165621, + "language_loss": 1.06002212, + "learning_rate": 0.0009216025849997171, + "loss": 1.07157135, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.2052002, + "step": 105, + "time_per_iteration": 2.8010010719299316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125545, + "balance_loss_mlp": 1.10562515, + "epoch": 0.020392458637937667, + "flos": 684760324608.0, + "grad_norm": 0.06774775888709755, + "language_loss": 1.00999045, + "learning_rate": 0.0009234796175212258, + "loss": 1.02124596, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.19909668, + "step": 106, + "time_per_iteration": 3.0094785690307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134088, + "balance_loss_mlp": 1.11433506, + "epoch": 0.02058484032320123, + "flos": 702115444224.0, + "grad_norm": 0.09956588263492473, + "language_loss": 1.04219186, + "learning_rate": 0.000925339025064007, + "loss": 1.05353272, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.1973877, + "step": 107, + "time_per_iteration": 2.9836714267730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112225, + "balance_loss_mlp": 1.1024735, + "epoch": 0.020777222008464793, + "flos": 639082275840.0, + "grad_norm": 0.06168154311284234, + "language_loss": 0.97232246, + "learning_rate": 0.0009271811355418027, + "loss": 0.98354501, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.19775391, + "step": 108, + "time_per_iteration": 2.860042095184326 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01120623, + "balance_loss_mlp": 1.10089409, + "epoch": 0.020969603693728356, + "flos": 682091974656.0, + "grad_norm": 0.10451884090896614, + "language_loss": 1.03835416, + "learning_rate": 0.0009290062678013548, + "loss": 1.04956043, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.19714355, + "step": 109, + "time_per_iteration": 2.8912689685821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116129, + "balance_loss_mlp": 1.09641171, + "epoch": 0.02116198537899192, + "flos": 533395487232.0, + "grad_norm": 0.12087494450090952, + "language_loss": 1.02292705, + "learning_rate": 0.0009308147319536321, + "loss": 1.03408837, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.19702148, + "step": 110, + "time_per_iteration": 2.682143449783325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123774, + "balance_loss_mlp": 1.10437846, + "epoch": 0.021354367064255482, + "flos": 717479119872.0, + "grad_norm": 0.09468465669548881, + "language_loss": 1.08714509, + "learning_rate": 0.0009326068296900676, + "loss": 1.09838271, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.19372559, + "step": 111, + "time_per_iteration": 2.8420276641845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113573, + "balance_loss_mlp": 1.09368885, + "epoch": 0.021546748749519045, + "flos": 519556322304.0, + "grad_norm": 0.06573635575260657, + "language_loss": 1.00160766, + "learning_rate": 0.0009343828545846161, + "loss": 1.01274335, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.19873047, + "step": 112, + "time_per_iteration": 2.81919264793396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140317, + "balance_loss_mlp": 1.1205641, + "epoch": 0.021739130434782608, + "flos": 505161391104.0, + "grad_norm": 0.10387186502959084, + "language_loss": 1.03632593, + "learning_rate": 0.0009361430923823841, + "loss": 1.04772925, + "num_input_tokens_seen": 
8508992, + "router_z_loss_mlp": 0.1973877, + "step": 113, + "time_per_iteration": 2.6119744777679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125249, + "balance_loss_mlp": 1.1051383, + "epoch": 0.02193151212004617, + "flos": 463486053888.0, + "grad_norm": 0.07902170601711563, + "language_loss": 1.07192981, + "learning_rate": 0.0009378878212755459, + "loss": 1.08318233, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.2010498, + "step": 114, + "time_per_iteration": 2.511798143386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121266, + "balance_loss_mlp": 1.10053515, + "epoch": 0.022123893805309734, + "flos": 552272546304.0, + "grad_norm": 0.07803352047235128, + "language_loss": 0.97866738, + "learning_rate": 0.0009396173121672103, + "loss": 0.98988008, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.20739746, + "step": 115, + "time_per_iteration": 2.664508819580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129328, + "balance_loss_mlp": 1.10866928, + "epoch": 0.022316275490573297, + "flos": 636211293696.0, + "grad_norm": 0.0856857268925464, + "language_loss": 1.03136635, + "learning_rate": 0.0009413318289238633, + "loss": 1.04265964, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.20666504, + "step": 116, + "time_per_iteration": 2.78078031539917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107565, + "balance_loss_mlp": 1.08696532, + "epoch": 0.02250865717583686, + "flos": 798890107392.0, + "grad_norm": 0.07931864844683259, + "language_loss": 0.9541564, + "learning_rate": 0.0009430316286169771, + "loss": 0.96523207, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.20605469, + "step": 117, + "time_per_iteration": 3.034813404083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162526, + "balance_loss_mlp": 1.14062762, + "epoch": 0.022701038861100423, + "flos": 456093296640.0, + "grad_norm": 
0.10907247817659571, + "language_loss": 1.00617993, + "learning_rate": 0.0009447169617543361, + "loss": 1.0178051, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.21899414, + "step": 118, + "time_per_iteration": 2.6340808868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173058, + "balance_loss_mlp": 1.15192246, + "epoch": 0.022893420546363986, + "flos": 583086159360.0, + "grad_norm": 0.12286397781369558, + "language_loss": 1.06791735, + "learning_rate": 0.0009463880725016029, + "loss": 1.0796479, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.21142578, + "step": 119, + "time_per_iteration": 2.7167999744415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112992, + "balance_loss_mlp": 1.10922527, + "epoch": 0.02308580223162755, + "flos": 561303240192.0, + "grad_norm": 0.1818455397825579, + "language_loss": 1.0306797, + "learning_rate": 0.0009480451988946134, + "loss": 1.04197884, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.20703125, + "step": 120, + "time_per_iteration": 2.8320834636688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127416, + "balance_loss_mlp": 1.10706663, + "epoch": 0.023278183916891113, + "flos": 771300983808.0, + "grad_norm": 0.061341545621049966, + "language_loss": 1.03699958, + "learning_rate": 0.0009496885730428627, + "loss": 1.04827368, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.20349121, + "step": 121, + "time_per_iteration": 3.0393545627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141944, + "balance_loss_mlp": 1.12239408, + "epoch": 0.023470565602154676, + "flos": 553374540288.0, + "grad_norm": 0.12547580017264032, + "language_loss": 1.01912796, + "learning_rate": 0.0009513184213246156, + "loss": 1.0305475, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.19543457, + "step": 122, + "time_per_iteration": 2.651719093322754 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01162278, + "balance_loss_mlp": 1.14191747, + "epoch": 0.02366294728741824, + "flos": 560028349440.0, + "grad_norm": 0.1065762842559702, + "language_loss": 1.05289114, + "learning_rate": 0.0009529349645740552, + "loss": 1.06451392, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.20361328, + "step": 123, + "time_per_iteration": 2.705214262008667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165232, + "balance_loss_mlp": 1.14444137, + "epoch": 0.0238553289726818, + "flos": 468553683456.0, + "grad_norm": 0.12380591024493681, + "language_loss": 1.04425788, + "learning_rate": 0.0009545384182608524, + "loss": 1.05591035, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.2076416, + "step": 124, + "time_per_iteration": 2.544631004333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143364, + "balance_loss_mlp": 1.12262154, + "epoch": 0.024047710657945365, + "flos": 560030920704.0, + "grad_norm": 0.07613482272455964, + "language_loss": 1.01444972, + "learning_rate": 0.0009561289926625252, + "loss": 1.0258832, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.20739746, + "step": 125, + "time_per_iteration": 2.6732449531555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140968, + "balance_loss_mlp": 1.11927211, + "epoch": 0.024240092343208928, + "flos": 504775950336.0, + "grad_norm": 0.13062804118014867, + "language_loss": 1.05952811, + "learning_rate": 0.0009577068930299292, + "loss": 1.07093775, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.21691895, + "step": 126, + "time_per_iteration": 2.5860514640808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111442, + "balance_loss_mlp": 1.09249783, + "epoch": 0.02443247402847249, + "flos": 435763307520.0, + "grad_norm": 0.11550485665133546, + "language_loss": 1.01208651, + "learning_rate": 0.0009592723197462087, + "loss": 1.02323079, + "num_input_tokens_seen": 9504112, + 
"router_z_loss_mlp": 0.21923828, + "step": 127, + "time_per_iteration": 2.680792808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139077, + "balance_loss_mlp": 1.1162957, + "epoch": 0.024624855713736054, + "flos": 683769558528.0, + "grad_norm": 0.07531268866570652, + "language_loss": 0.98376709, + "learning_rate": 0.0009608254684795125, + "loss": 0.99515784, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.2277832, + "step": 128, + "time_per_iteration": 2.962553024291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151675, + "balance_loss_mlp": 1.12746358, + "epoch": 0.024817237398999614, + "flos": 524999480832.0, + "grad_norm": 0.10067493874109901, + "language_loss": 1.01099372, + "learning_rate": 0.0009623665303297678, + "loss": 1.02251053, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.24206543, + "step": 129, + "time_per_iteration": 2.7238845825195312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178581, + "balance_loss_mlp": 1.1552279, + "epoch": 0.025009619084263177, + "flos": 655656602112.0, + "grad_norm": 0.11648498824832396, + "language_loss": 1.04954159, + "learning_rate": 0.0009638956919697878, + "loss": 1.06132734, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.23352051, + "step": 130, + "time_per_iteration": 2.878931999206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180963, + "balance_loss_mlp": 1.15737128, + "epoch": 0.02520200076952674, + "flos": 454423053312.0, + "grad_norm": 0.07835178368021106, + "language_loss": 0.97041726, + "learning_rate": 0.0009654131357809714, + "loss": 0.98222685, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.23596191, + "step": 131, + "time_per_iteration": 2.646268367767334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187479, + "balance_loss_mlp": 1.1633389, + "epoch": 0.025394382454790303, + "flos": 839794563072.0, + "grad_norm": 0.08592709100669786, + 
"language_loss": 1.06445599, + "learning_rate": 0.0009669190399838441, + "loss": 1.07633078, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.24169922, + "step": 132, + "time_per_iteration": 3.1253442764282227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178278, + "balance_loss_mlp": 1.15288627, + "epoch": 0.025586764140053866, + "flos": 581025332736.0, + "grad_norm": 0.06433616224475917, + "language_loss": 0.99044776, + "learning_rate": 0.0009684135787636724, + "loss": 1.00223053, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.25402832, + "step": 133, + "time_per_iteration": 2.831838846206665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193803, + "balance_loss_mlp": 1.16735041, + "epoch": 0.02577914582531743, + "flos": 790249623552.0, + "grad_norm": 0.10671106752503096, + "language_loss": 1.03402495, + "learning_rate": 0.0009698969223913726, + "loss": 1.04596305, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.26452637, + "step": 134, + "time_per_iteration": 3.0395402908325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167127, + "balance_loss_mlp": 1.14202118, + "epoch": 0.025971527510580992, + "flos": 594958473216.0, + "grad_norm": 0.08439936893253437, + "language_loss": 1.06654739, + "learning_rate": 0.0009713692373399265, + "loss": 1.0782187, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.25109863, + "step": 135, + "time_per_iteration": 2.7715206146240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463238, + "balance_loss_mlp": 1.43119502, + "epoch": 0.026163909195844555, + "flos": 1577629716480.0, + "grad_norm": 0.13141202298162255, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.80919468, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 0.3203125, + "step": 136, + "time_per_iteration": 5.708434820175171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01366397, + "balance_loss_mlp": 1.3391223, + "epoch": 0.026356290881108118, + "flos": 1502074865664.0, + "grad_norm": 0.10789098637796743, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79177433, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 0.2734375, + "step": 137, + "time_per_iteration": 4.936312198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164239, + "balance_loss_mlp": 1.14192283, + "epoch": 0.02654867256637168, + "flos": 597140066304.0, + "grad_norm": 0.07737815407023008, + "language_loss": 0.99685001, + "learning_rate": 0.0009757216201974225, + "loss": 1.00849247, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.22338867, + "step": 138, + "time_per_iteration": 2.848794460296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186286, + "balance_loss_mlp": 1.16373122, + "epoch": 0.026741054251635244, + "flos": 545035433472.0, + "grad_norm": 0.07356199280990307, + "language_loss": 1.04477906, + "learning_rate": 0.0009771514130396581, + "loss": 1.05664206, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.22546387, + "step": 139, + "time_per_iteration": 2.735100746154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191574, + "balance_loss_mlp": 1.17103469, + "epoch": 0.026933435936898807, + "flos": 506841546240.0, + "grad_norm": 0.09793912671864533, + "language_loss": 1.04422235, + "learning_rate": 0.00097857095638274, + "loss": 1.05613816, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.20544434, + "step": 140, + "time_per_iteration": 2.6398932933807373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187088, + "balance_loss_mlp": 1.16559434, + "epoch": 0.02712581762216237, + "flos": 740860328448.0, + "grad_norm": 0.08846308668893199, + "language_loss": 0.95874435, + "learning_rate": 0.0009799803961288726, + "loss": 0.97061527, + "num_input_tokens_seen": 10886416, + 
"router_z_loss_mlp": 0.21484375, + "step": 141, + "time_per_iteration": 3.0505003929138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160228, + "balance_loss_mlp": 1.13921118, + "epoch": 0.027318199307425933, + "flos": 848373378048.0, + "grad_norm": 0.09598553540990232, + "language_loss": 1.0168581, + "learning_rate": 0.000981379875086876, + "loss": 1.02846038, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.21020508, + "step": 142, + "time_per_iteration": 3.0870697498321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143434, + "balance_loss_mlp": 1.12091553, + "epoch": 0.027510580992689496, + "flos": 575557581312.0, + "grad_norm": 0.08800286540083159, + "language_loss": 0.96917391, + "learning_rate": 0.0009827695330590185, + "loss": 0.98060828, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.2253418, + "step": 143, + "time_per_iteration": 2.719317674636841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128989, + "balance_loss_mlp": 1.10631514, + "epoch": 0.02770296267795306, + "flos": 772420230144.0, + "grad_norm": 0.09792527853337853, + "language_loss": 0.96426451, + "learning_rate": 0.0009841495069248256, + "loss": 0.97555441, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.22692871, + "step": 144, + "time_per_iteration": 3.014765739440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011276, + "balance_loss_mlp": 1.10584438, + "epoch": 0.027895344363216622, + "flos": 569387957760.0, + "grad_norm": 0.06966533855263184, + "language_loss": 0.95713264, + "learning_rate": 0.0009855199307219871, + "loss": 0.9684087, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.21777344, + "step": 145, + "time_per_iteration": 2.6709253787994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148016, + "balance_loss_mlp": 1.12558043, + "epoch": 0.028087726048480186, + "flos": 547360561152.0, + "grad_norm": 
0.09436929899226476, + "language_loss": 0.97337723, + "learning_rate": 0.0009868809357244854, + "loss": 0.98485744, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.22424316, + "step": 146, + "time_per_iteration": 2.669283390045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119223, + "balance_loss_mlp": 1.09726429, + "epoch": 0.02828010773374375, + "flos": 524789508096.0, + "grad_norm": 0.0790038702862921, + "language_loss": 1.01669443, + "learning_rate": 0.0009882326505180556, + "loss": 1.02788651, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.21948242, + "step": 147, + "time_per_iteration": 2.704292058944702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138866, + "balance_loss_mlp": 1.11706281, + "epoch": 0.02847248941900731, + "flos": 772440053760.0, + "grad_norm": 0.10005424592603226, + "language_loss": 0.99863935, + "learning_rate": 0.0009895752010730906, + "loss": 1.010028, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.21801758, + "step": 148, + "time_per_iteration": 2.9581809043884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113837, + "balance_loss_mlp": 1.09122324, + "epoch": 0.028664871104270875, + "flos": 534413417472.0, + "grad_norm": 0.059614583623975884, + "language_loss": 1.06015503, + "learning_rate": 0.0009909087108150867, + "loss": 1.07129347, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.22619629, + "step": 149, + "time_per_iteration": 2.741159200668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121459, + "balance_loss_mlp": 1.09761691, + "epoch": 0.028857252789534438, + "flos": 367766396928.0, + "grad_norm": 0.11202611832617501, + "language_loss": 1.06769323, + "learning_rate": 0.0009922333006927371, + "loss": 1.07890773, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.23852539, + "step": 150, + "time_per_iteration": 2.4982028007507324 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01130563, + "balance_loss_mlp": 1.10591054, + "epoch": 0.029049634474798, + "flos": 515482030080.0, + "grad_norm": 0.07605307561327067, + "language_loss": 1.00263429, + "learning_rate": 0.0009935490892437632, + "loss": 1.01393986, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.2467041, + "step": 151, + "time_per_iteration": 2.603449583053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144852, + "balance_loss_mlp": 1.12064028, + "epoch": 0.029242016160061564, + "flos": 588141305856.0, + "grad_norm": 0.10272840367827417, + "language_loss": 0.98558784, + "learning_rate": 0.0009948561926585687, + "loss": 0.99703634, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.24206543, + "step": 152, + "time_per_iteration": 2.8270881175994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152248, + "balance_loss_mlp": 1.12610579, + "epoch": 0.029434397845325123, + "flos": 552079825920.0, + "grad_norm": 0.09161667027770293, + "language_loss": 1.02430511, + "learning_rate": 0.0009961547248418122, + "loss": 1.03582752, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.26159668, + "step": 153, + "time_per_iteration": 2.6539955139160156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145984, + "balance_loss_mlp": 1.11949599, + "epoch": 0.029626779530588686, + "flos": 603497640960.0, + "grad_norm": 0.12801957864517624, + "language_loss": 0.99122071, + "learning_rate": 0.0009974447974719707, + "loss": 1.00268054, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.26477051, + "step": 154, + "time_per_iteration": 2.7382068634033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149383, + "balance_loss_mlp": 1.12209582, + "epoch": 0.02981916121585225, + "flos": 621089897472.0, + "grad_norm": 0.08800353648973465, + "language_loss": 1.01358569, + "learning_rate": 0.0009987265200589763, + "loss": 1.02507949, + "num_input_tokens_seen": 
11902192, + "router_z_loss_mlp": 0.27307129, + "step": 155, + "time_per_iteration": 2.7484042644500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146509, + "balance_loss_mlp": 1.11942446, + "epoch": 0.030011542901115813, + "flos": 661633505280.0, + "grad_norm": 0.06940376161599653, + "language_loss": 1.00859666, + "learning_rate": 0.001, + "loss": 1.02006161, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.27124023, + "step": 156, + "time_per_iteration": 2.9298081398010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143379, + "balance_loss_mlp": 1.11696208, + "epoch": 0.030203924586379376, + "flos": 651569826816.0, + "grad_norm": 0.07290625745558146, + "language_loss": 0.98239183, + "learning_rate": 0.0009999999029413921, + "loss": 0.99382555, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.2644043, + "step": 157, + "time_per_iteration": 2.8521509170532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143494, + "balance_loss_mlp": 1.11851931, + "epoch": 0.03039630627164294, + "flos": 531354484224.0, + "grad_norm": 0.10227720632383495, + "language_loss": 0.99759698, + "learning_rate": 0.0009999996117656068, + "loss": 1.00903201, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.24975586, + "step": 158, + "time_per_iteration": 2.7299323081970215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132124, + "balance_loss_mlp": 1.10562384, + "epoch": 0.030588687956906502, + "flos": 586189509120.0, + "grad_norm": 0.10099325970079884, + "language_loss": 0.93055141, + "learning_rate": 0.0009999991264727564, + "loss": 0.94187272, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.26489258, + "step": 159, + "time_per_iteration": 2.838892698287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115378, + "balance_loss_mlp": 1.08908033, + "epoch": 0.030781069642170065, + "flos": 513278042112.0, + "grad_norm": 
0.07019569540009855, + "language_loss": 1.04060161, + "learning_rate": 0.0009999984470630296, + "loss": 1.05175543, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.26330566, + "step": 160, + "time_per_iteration": 2.6468522548675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127141, + "balance_loss_mlp": 1.09948444, + "epoch": 0.030973451327433628, + "flos": 718123719168.0, + "grad_norm": 0.11009923170951091, + "language_loss": 0.93022841, + "learning_rate": 0.0009999975735366902, + "loss": 0.94149983, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.27636719, + "step": 161, + "time_per_iteration": 3.0944836139678955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113017, + "balance_loss_mlp": 1.10256159, + "epoch": 0.03116583301269719, + "flos": 1109771311104.0, + "grad_norm": 0.1021195191465028, + "language_loss": 0.94580781, + "learning_rate": 0.0009999965058940775, + "loss": 0.95710957, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.27624512, + "step": 162, + "time_per_iteration": 3.5266401767730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140975, + "balance_loss_mlp": 1.11293721, + "epoch": 0.031358214697960754, + "flos": 450907098624.0, + "grad_norm": 0.10168672994339449, + "language_loss": 1.00657988, + "learning_rate": 0.0009999952441356057, + "loss": 1.01798964, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.28027344, + "step": 163, + "time_per_iteration": 2.5260584354400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117443, + "balance_loss_mlp": 1.09220648, + "epoch": 0.031550596383224314, + "flos": 1255176870912.0, + "grad_norm": 0.059509842301402785, + "language_loss": 1.01277101, + "learning_rate": 0.000999993788261765, + "loss": 1.02394545, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.25231934, + "step": 164, + "time_per_iteration": 3.585451126098633 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01117278, + "balance_loss_mlp": 1.09226811, + "epoch": 0.03174297806848788, + "flos": 668136812544.0, + "grad_norm": 0.08345282656088489, + "language_loss": 1.02586234, + "learning_rate": 0.00099999213827312, + "loss": 1.03703511, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.25036621, + "step": 165, + "time_per_iteration": 2.815709352493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126749, + "balance_loss_mlp": 1.10315728, + "epoch": 0.03193535975375144, + "flos": 551299032576.0, + "grad_norm": 0.06906572503593703, + "language_loss": 0.97404492, + "learning_rate": 0.000999990294170312, + "loss": 0.98531234, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.23596191, + "step": 166, + "time_per_iteration": 2.663247585296631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123432, + "balance_loss_mlp": 1.09961414, + "epoch": 0.032127741439015006, + "flos": 543649314816.0, + "grad_norm": 0.06114993163800343, + "language_loss": 1.01775765, + "learning_rate": 0.0009999882559540566, + "loss": 1.02899194, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.23803711, + "step": 167, + "time_per_iteration": 2.6779284477233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113429, + "balance_loss_mlp": 1.08983719, + "epoch": 0.032320123124278566, + "flos": 548385831936.0, + "grad_norm": 0.051224230506458926, + "language_loss": 0.98247135, + "learning_rate": 0.000999986023625145, + "loss": 0.99360555, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.23571777, + "step": 168, + "time_per_iteration": 2.8207764625549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02968107, + "balance_loss_mlp": 2.92347527, + "epoch": 0.03251250480954213, + "flos": 1305886551552.0, + "grad_norm": 0.42377736764400964, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.81892526, + "num_input_tokens_seen": 
13107344, + "router_z_loss_mlp": 0.44726562, + "step": 169, + "time_per_iteration": 5.030913591384888 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110197, + "balance_loss_mlp": 1.08739257, + "epoch": 0.03270488649480569, + "flos": 561132914688.0, + "grad_norm": 0.11749522299339567, + "language_loss": 0.99391603, + "learning_rate": 0.0009999809766328958, + "loss": 1.005018, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.22839355, + "step": 170, + "time_per_iteration": 2.7288546562194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138899, + "balance_loss_mlp": 1.11526036, + "epoch": 0.03289726818006926, + "flos": 482363112960.0, + "grad_norm": 0.10262090882217431, + "language_loss": 1.01489758, + "learning_rate": 0.0009999781619715177, + "loss": 1.02628672, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.23620605, + "step": 171, + "time_per_iteration": 2.57743239402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152618, + "balance_loss_mlp": 1.12847793, + "epoch": 0.03308964986533282, + "flos": 674647460352.0, + "grad_norm": 0.09929883602390663, + "language_loss": 1.00886559, + "learning_rate": 0.000999975153201402, + "loss": 1.0203917, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.24121094, + "step": 172, + "time_per_iteration": 2.8398427963256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164032, + "balance_loss_mlp": 1.13917661, + "epoch": 0.033282031550596385, + "flos": 609217583616.0, + "grad_norm": 0.07630899187603161, + "language_loss": 0.98461914, + "learning_rate": 0.0009999719503237174, + "loss": 0.99625951, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.24865723, + "step": 173, + "time_per_iteration": 2.7653093338012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119131, + "balance_loss_mlp": 1.16379607, + "epoch": 0.033474413235859944, + "flos": 468039762432.0, + "grad_norm": 
0.11225996182460839, + "language_loss": 1.07204938, + "learning_rate": 0.0009999685533397073, + "loss": 1.08396256, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.27514648, + "step": 174, + "time_per_iteration": 2.560985565185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174784, + "balance_loss_mlp": 1.14617324, + "epoch": 0.03366679492112351, + "flos": 579634444800.0, + "grad_norm": 0.09969766363227954, + "language_loss": 0.99471402, + "learning_rate": 0.00099996496225069, + "loss": 1.00646186, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.28637695, + "step": 175, + "time_per_iteration": 2.685511589050293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191936, + "balance_loss_mlp": 1.16053653, + "epoch": 0.03385917660638707, + "flos": 637678904832.0, + "grad_norm": 0.07081110533815538, + "language_loss": 1.01830065, + "learning_rate": 0.0009999611770580604, + "loss": 1.03022003, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.31396484, + "step": 176, + "time_per_iteration": 2.848646879196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184826, + "balance_loss_mlp": 1.1498735, + "epoch": 0.03405155829165064, + "flos": 441816933888.0, + "grad_norm": 0.08630072774372038, + "language_loss": 1.00571251, + "learning_rate": 0.0009999571977632876, + "loss": 1.01756072, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.34960938, + "step": 177, + "time_per_iteration": 2.5936646461486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183799, + "balance_loss_mlp": 1.14896631, + "epoch": 0.034243939976914196, + "flos": 466332443136.0, + "grad_norm": 0.14796843181278477, + "language_loss": 1.03395152, + "learning_rate": 0.0009999530243679166, + "loss": 1.04578948, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.34863281, + "step": 178, + "time_per_iteration": 2.578585386276245 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0119148, + "balance_loss_mlp": 1.15502596, + "epoch": 0.03443632166217776, + "flos": 779276671488.0, + "grad_norm": 0.07456630143082679, + "language_loss": 0.98466933, + "learning_rate": 0.0009999486568735675, + "loss": 0.99658418, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.36450195, + "step": 179, + "time_per_iteration": 3.0958807468414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204458, + "balance_loss_mlp": 1.16657281, + "epoch": 0.03462870334744132, + "flos": 1263777707520.0, + "grad_norm": 0.1071704794457763, + "language_loss": 0.98888862, + "learning_rate": 0.0009999440952819362, + "loss": 1.00093329, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.37841797, + "step": 180, + "time_per_iteration": 3.7027652263641357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119721, + "balance_loss_mlp": 1.1615665, + "epoch": 0.03482108503270489, + "flos": 607179151872.0, + "grad_norm": 0.09808227157719927, + "language_loss": 0.98941529, + "learning_rate": 0.0009999393395947935, + "loss": 1.00138736, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.35644531, + "step": 181, + "time_per_iteration": 2.9549217224121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178907, + "balance_loss_mlp": 1.1453855, + "epoch": 0.03501346671796845, + "flos": 538270396416.0, + "grad_norm": 0.07390299993950959, + "language_loss": 1.01616848, + "learning_rate": 0.0009999343898139858, + "loss": 1.02795744, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.33520508, + "step": 182, + "time_per_iteration": 2.6392982006073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183604, + "balance_loss_mlp": 1.15117884, + "epoch": 0.035205848403232015, + "flos": 518484063744.0, + "grad_norm": 0.07686941510438546, + "language_loss": 1.00897217, + "learning_rate": 0.0009999292459414348, + "loss": 1.02080822, + "num_input_tokens_seen": 14126656, 
+ "router_z_loss_mlp": 0.32397461, + "step": 183, + "time_per_iteration": 2.657658338546753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158427, + "balance_loss_mlp": 1.12702751, + "epoch": 0.035398230088495575, + "flos": 472373586432.0, + "grad_norm": 0.08111160194171327, + "language_loss": 1.04917085, + "learning_rate": 0.0009999239079791374, + "loss": 1.06075525, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.3137207, + "step": 184, + "time_per_iteration": 2.631359815597534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115468, + "balance_loss_mlp": 1.12359011, + "epoch": 0.03559061177375914, + "flos": 512074732032.0, + "grad_norm": 0.06813164935019152, + "language_loss": 0.98483247, + "learning_rate": 0.0009999183759291659, + "loss": 0.99637926, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.31054688, + "step": 185, + "time_per_iteration": 2.7329554557800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133475, + "balance_loss_mlp": 1.10393476, + "epoch": 0.0357829934590227, + "flos": 477386887680.0, + "grad_norm": 0.1084122935019402, + "language_loss": 1.00212467, + "learning_rate": 0.0009999126497936682, + "loss": 1.01345944, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.29516602, + "step": 186, + "time_per_iteration": 2.5334415435791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110251, + "balance_loss_mlp": 1.08080626, + "epoch": 0.03597537514428627, + "flos": 644656485888.0, + "grad_norm": 0.057007065611444814, + "language_loss": 1.03274298, + "learning_rate": 0.0009999067295748676, + "loss": 1.04384542, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.29443359, + "step": 187, + "time_per_iteration": 2.8514976501464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120219, + "balance_loss_mlp": 1.09280062, + "epoch": 0.03616775682954983, + "flos": 581186119680.0, + "grad_norm": 
0.1063888726335035, + "language_loss": 1.00729585, + "learning_rate": 0.000999900615275062, + "loss": 1.01849806, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.27441406, + "step": 188, + "time_per_iteration": 2.7070038318634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115411, + "balance_loss_mlp": 1.08773041, + "epoch": 0.03636013851481339, + "flos": 382420859904.0, + "grad_norm": 0.10114859104821755, + "language_loss": 1.06676006, + "learning_rate": 0.0009998943068966256, + "loss": 1.07791412, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.27709961, + "step": 189, + "time_per_iteration": 2.459259271621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128247, + "balance_loss_mlp": 1.0989449, + "epoch": 0.03655252020007695, + "flos": 583224551424.0, + "grad_norm": 0.09267057508255847, + "language_loss": 1.01174653, + "learning_rate": 0.0009998878044420072, + "loss": 1.02302897, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.29296875, + "step": 190, + "time_per_iteration": 2.710602045059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128613, + "balance_loss_mlp": 1.09881067, + "epoch": 0.03674490188534051, + "flos": 471619957248.0, + "grad_norm": 0.06756260422642338, + "language_loss": 0.9758327, + "learning_rate": 0.0009998811079137318, + "loss": 0.98711884, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.2980957, + "step": 191, + "time_per_iteration": 2.657074451446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144449, + "balance_loss_mlp": 1.11092758, + "epoch": 0.03693728357060408, + "flos": 528372274176.0, + "grad_norm": 0.08238379115749897, + "language_loss": 0.98845601, + "learning_rate": 0.0009998742173143987, + "loss": 0.99990052, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.33544922, + "step": 192, + "time_per_iteration": 2.637148857116699 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01155548, + "balance_loss_mlp": 1.12164509, + "epoch": 0.03712966525586764, + "flos": 798993994752.0, + "grad_norm": 0.08708605999438765, + "language_loss": 0.98628879, + "learning_rate": 0.0009998671326466833, + "loss": 0.99784422, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.33911133, + "step": 193, + "time_per_iteration": 3.0115370750427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152209, + "balance_loss_mlp": 1.11556399, + "epoch": 0.037322046941131205, + "flos": 829973164032.0, + "grad_norm": 0.10177169507488108, + "language_loss": 0.98986697, + "learning_rate": 0.0009998598539133362, + "loss": 1.00138903, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.36645508, + "step": 194, + "time_per_iteration": 3.144454002380371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161775, + "balance_loss_mlp": 1.12694228, + "epoch": 0.037514428626394765, + "flos": 437685742080.0, + "grad_norm": 0.0772667916065631, + "language_loss": 1.00733018, + "learning_rate": 0.0009998523811171828, + "loss": 1.01894796, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.34863281, + "step": 195, + "time_per_iteration": 2.577711820602417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158404, + "balance_loss_mlp": 1.12314177, + "epoch": 0.03770681031165833, + "flos": 511625051136.0, + "grad_norm": 0.12690614805983907, + "language_loss": 1.00355625, + "learning_rate": 0.0009998447142611248, + "loss": 1.0151403, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.3527832, + "step": 196, + "time_per_iteration": 2.690129041671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157374, + "balance_loss_mlp": 1.12332833, + "epoch": 0.03789919199692189, + "flos": 807449098752.0, + "grad_norm": 0.06577024943575122, + "language_loss": 0.94151276, + "learning_rate": 0.0009998368533481387, + "loss": 0.9530865, + "num_input_tokens_seen": 
15124864, + "router_z_loss_mlp": 0.34057617, + "step": 197, + "time_per_iteration": 3.045903444290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135441, + "balance_loss_mlp": 1.10120416, + "epoch": 0.03809157368218546, + "flos": 690576814080.0, + "grad_norm": 0.07117988238902957, + "language_loss": 0.9709003, + "learning_rate": 0.0009998287983812762, + "loss": 0.98225474, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.34277344, + "step": 198, + "time_per_iteration": 2.8663957118988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153893, + "balance_loss_mlp": 1.11910725, + "epoch": 0.03828395536744902, + "flos": 517940407296.0, + "grad_norm": 0.08607783918573575, + "language_loss": 1.02875066, + "learning_rate": 0.0009998205493636646, + "loss": 1.04028964, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.34790039, + "step": 199, + "time_per_iteration": 2.6874265670776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113429, + "balance_loss_mlp": 1.10010099, + "epoch": 0.038476337052712584, + "flos": 581662964736.0, + "grad_norm": 0.06925160872633124, + "language_loss": 0.95776969, + "learning_rate": 0.0009998121062985063, + "loss": 0.96911263, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.34179688, + "step": 200, + "time_per_iteration": 2.7165024280548096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137597, + "balance_loss_mlp": 1.10424268, + "epoch": 0.03866871873797614, + "flos": 577086861312.0, + "grad_norm": 0.05789149863906192, + "language_loss": 0.98006374, + "learning_rate": 0.0009998034691890794, + "loss": 0.9914397, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.33349609, + "step": 201, + "time_per_iteration": 2.8032913208007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122408, + "balance_loss_mlp": 1.09148479, + "epoch": 0.03886110042323971, + "flos": 540731344896.0, + "grad_norm": 
0.07027358299059557, + "language_loss": 1.02082264, + "learning_rate": 0.0009997946380387369, + "loss": 1.03204679, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.30932617, + "step": 202, + "time_per_iteration": 2.6880364418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113731, + "balance_loss_mlp": 1.08206952, + "epoch": 0.03905348210850327, + "flos": 718002952704.0, + "grad_norm": 0.06026816631059916, + "language_loss": 1.0439496, + "learning_rate": 0.0009997856128509076, + "loss": 1.05508685, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.31665039, + "step": 203, + "time_per_iteration": 2.8704147338867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120628, + "balance_loss_mlp": 1.089324, + "epoch": 0.039245863793766836, + "flos": 427493583360.0, + "grad_norm": 0.09379084264769941, + "language_loss": 0.99581945, + "learning_rate": 0.0009997763936290952, + "loss": 1.00702572, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.31298828, + "step": 204, + "time_per_iteration": 2.527740478515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131571, + "balance_loss_mlp": 1.09924173, + "epoch": 0.039438245479030395, + "flos": 663096347136.0, + "grad_norm": 0.09654929574768753, + "language_loss": 1.03863358, + "learning_rate": 0.0009997669803768789, + "loss": 1.04994941, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.32324219, + "step": 205, + "time_per_iteration": 2.7987287044525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114569, + "balance_loss_mlp": 1.08383656, + "epoch": 0.03963062716429396, + "flos": 635349007872.0, + "grad_norm": 0.07990731679878747, + "language_loss": 0.99632657, + "learning_rate": 0.0009997573730979134, + "loss": 1.00747228, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.30712891, + "step": 206, + "time_per_iteration": 2.73876953125 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.03108122, + "balance_loss_mlp": 2.9547708, + "epoch": 0.03982300884955752, + "flos": 1418565975552.0, + "grad_norm": 0.40060181244225235, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.82301319, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 1.53125, + "step": 207, + "time_per_iteration": 4.722966432571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165141, + "balance_loss_mlp": 1.13169098, + "epoch": 0.04001539053482109, + "flos": 689118741504.0, + "grad_norm": 0.10630094281595456, + "language_loss": 0.98190439, + "learning_rate": 0.0009997375764747294, + "loss": 0.99355578, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.33447266, + "step": 208, + "time_per_iteration": 3.063753128051758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176428, + "balance_loss_mlp": 1.14395499, + "epoch": 0.04020777222008465, + "flos": 533639964672.0, + "grad_norm": 0.08070097632442315, + "language_loss": 0.96488065, + "learning_rate": 0.0009997273871381967, + "loss": 0.97664487, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.32470703, + "step": 209, + "time_per_iteration": 2.738070249557495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199702, + "balance_loss_mlp": 1.16675293, + "epoch": 0.040400153905348214, + "flos": 567927687168.0, + "grad_norm": 0.07976675940517855, + "language_loss": 1.01156783, + "learning_rate": 0.0009997170037902862, + "loss": 1.02356482, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.32958984, + "step": 210, + "time_per_iteration": 2.7301113605499268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207423, + "balance_loss_mlp": 1.17323339, + "epoch": 0.040592535590611774, + "flos": 713439332352.0, + "grad_norm": 0.10146454126791286, + "language_loss": 1.03771198, + "learning_rate": 0.0009997064264350292, + "loss": 1.04978609, + "num_input_tokens_seen": 
16283984, + "router_z_loss_mlp": 0.34228516, + "step": 211, + "time_per_iteration": 2.8760437965393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199912, + "balance_loss_mlp": 1.16364872, + "epoch": 0.04078491727587533, + "flos": 578100022272.0, + "grad_norm": 0.07215586978638981, + "language_loss": 0.9769634, + "learning_rate": 0.0009996956550765317, + "loss": 0.98896253, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.36254883, + "step": 212, + "time_per_iteration": 2.704005479812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209491, + "balance_loss_mlp": 1.17389536, + "epoch": 0.0409772989611389, + "flos": 552299710464.0, + "grad_norm": 0.07781252726849613, + "language_loss": 0.9221555, + "learning_rate": 0.0009996846897189762, + "loss": 0.93425035, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.35595703, + "step": 213, + "time_per_iteration": 2.6465373039245605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209061, + "balance_loss_mlp": 1.17510998, + "epoch": 0.04116968064640246, + "flos": 555630285312.0, + "grad_norm": 0.09713937665314668, + "language_loss": 0.99115217, + "learning_rate": 0.0009996735303666193, + "loss": 1.00324273, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.33984375, + "step": 214, + "time_per_iteration": 2.7262256145477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217734, + "balance_loss_mlp": 1.18261504, + "epoch": 0.041362062331666026, + "flos": 578492803584.0, + "grad_norm": 0.0828033847449013, + "language_loss": 1.01114583, + "learning_rate": 0.0009996621770237937, + "loss": 1.02332306, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.35131836, + "step": 215, + "time_per_iteration": 2.774261951446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228156, + "balance_loss_mlp": 1.19122505, + "epoch": 0.041554444016929586, + "flos": 611443593216.0, + "grad_norm": 
0.09368483866206018, + "language_loss": 0.9696883, + "learning_rate": 0.0009996506296949073, + "loss": 0.98196977, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.36889648, + "step": 216, + "time_per_iteration": 2.9090492725372314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227468, + "balance_loss_mlp": 1.18944013, + "epoch": 0.04174682570219315, + "flos": 528115313664.0, + "grad_norm": 0.07734539448931728, + "language_loss": 0.9667756, + "learning_rate": 0.0009996388883844428, + "loss": 0.97905028, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.38037109, + "step": 217, + "time_per_iteration": 2.6576592922210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219854, + "balance_loss_mlp": 1.18299437, + "epoch": 0.04193920738745671, + "flos": 511506482688.0, + "grad_norm": 0.05044055802439308, + "language_loss": 1.01232481, + "learning_rate": 0.0009996269530969588, + "loss": 1.02452338, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.36865234, + "step": 218, + "time_per_iteration": 2.5997114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212901, + "balance_loss_mlp": 1.17787719, + "epoch": 0.04213158907272028, + "flos": 571490629632.0, + "grad_norm": 0.09536649242864963, + "language_loss": 0.99537694, + "learning_rate": 0.0009996148238370888, + "loss": 1.00750601, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.3503418, + "step": 219, + "time_per_iteration": 2.794192314147949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210779, + "balance_loss_mlp": 1.17465854, + "epoch": 0.04232397075798384, + "flos": 964222589952.0, + "grad_norm": 0.05253448416537987, + "language_loss": 0.95164675, + "learning_rate": 0.0009996025006095421, + "loss": 0.96375448, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.36132812, + "step": 220, + "time_per_iteration": 3.387816905975342 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.03435379, + "balance_loss_mlp": 3.13935852, + "epoch": 0.042516352443247404, + "flos": 1469595778560.0, + "grad_norm": 0.2631843872282414, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.81218523, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 2.953125, + "step": 221, + "time_per_iteration": 5.1907196044921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198698, + "balance_loss_mlp": 1.16422272, + "epoch": 0.042708734128510964, + "flos": 654712823808.0, + "grad_norm": 0.084470461812628, + "language_loss": 0.96455717, + "learning_rate": 0.0009995772722706307, + "loss": 0.97654414, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.3449707, + "step": 222, + "time_per_iteration": 2.817683219909668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196045, + "balance_loss_mlp": 1.16013885, + "epoch": 0.04290111581377453, + "flos": 431827407360.0, + "grad_norm": 0.09039049489351958, + "language_loss": 1.09978271, + "learning_rate": 0.0009995643671690604, + "loss": 1.11174321, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.35888672, + "step": 223, + "time_per_iteration": 2.473952293395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187658, + "balance_loss_mlp": 1.15511417, + "epoch": 0.04309349749903809, + "flos": 644676309504.0, + "grad_norm": 0.098631665550633, + "language_loss": 0.9726367, + "learning_rate": 0.0009995512681194023, + "loss": 0.98451328, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.32543945, + "step": 224, + "time_per_iteration": 2.8320233821868896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173941, + "balance_loss_mlp": 1.14256525, + "epoch": 0.04328587918430166, + "flos": 831267505152.0, + "grad_norm": 0.09492392354161142, + "language_loss": 0.95751745, + "learning_rate": 0.0009995379751267417, + "loss": 0.96925682, + "num_input_tokens_seen": 17488096, + 
"router_z_loss_mlp": 0.31347656, + "step": 225, + "time_per_iteration": 3.265004873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151277, + "balance_loss_mlp": 1.11923385, + "epoch": 0.043478260869565216, + "flos": 525066292224.0, + "grad_norm": 0.07692691631212083, + "language_loss": 0.96905231, + "learning_rate": 0.0009995244881962398, + "loss": 0.98056507, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.3203125, + "step": 226, + "time_per_iteration": 2.6380093097686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122529, + "balance_loss_mlp": 1.09217834, + "epoch": 0.04367064255482878, + "flos": 439484465664.0, + "grad_norm": 0.1280080940779162, + "language_loss": 0.97453952, + "learning_rate": 0.0009995108073331323, + "loss": 0.98576486, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.30322266, + "step": 227, + "time_per_iteration": 2.611384630203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116442, + "balance_loss_mlp": 1.08482742, + "epoch": 0.04386302424009234, + "flos": 507380060160.0, + "grad_norm": 0.05834750559212819, + "language_loss": 1.00860834, + "learning_rate": 0.0009994969325427309, + "loss": 1.01977265, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.31640625, + "step": 228, + "time_per_iteration": 2.690300941467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123233, + "balance_loss_mlp": 1.08851922, + "epoch": 0.04405540592535591, + "flos": 540694268928.0, + "grad_norm": 0.06096273128167382, + "language_loss": 0.96635395, + "learning_rate": 0.0009994828638304218, + "loss": 0.97758633, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.34716797, + "step": 229, + "time_per_iteration": 2.666841506958008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128717, + "balance_loss_mlp": 1.093979, + "epoch": 0.04424778761061947, + "flos": 446370642432.0, + "grad_norm": 0.08326095283531681, 
+ "language_loss": 1.02012706, + "learning_rate": 0.0009994686012016675, + "loss": 1.03141427, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.34765625, + "step": 230, + "time_per_iteration": 2.5846869945526123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153704, + "balance_loss_mlp": 1.12056351, + "epoch": 0.044440169295883035, + "flos": 700702161408.0, + "grad_norm": 0.09069798767816209, + "language_loss": 1.01831698, + "learning_rate": 0.000999454144662005, + "loss": 1.02985406, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.33154297, + "step": 231, + "time_per_iteration": 2.923693895339966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115334, + "balance_loss_mlp": 1.11736226, + "epoch": 0.044632550981146595, + "flos": 588329256960.0, + "grad_norm": 0.09055690768180072, + "language_loss": 0.95778871, + "learning_rate": 0.0009994394942170468, + "loss": 0.9693222, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.35961914, + "step": 232, + "time_per_iteration": 2.7030160427093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142015, + "balance_loss_mlp": 1.10806465, + "epoch": 0.04482493266641016, + "flos": 554797734912.0, + "grad_norm": 0.058800323500690845, + "language_loss": 0.93958372, + "learning_rate": 0.0009994246498724808, + "loss": 0.95100385, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.33984375, + "step": 233, + "time_per_iteration": 2.7212979793548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138192, + "balance_loss_mlp": 1.10519481, + "epoch": 0.04501731435167372, + "flos": 722813621760.0, + "grad_norm": 0.06773344256027236, + "language_loss": 0.96352422, + "learning_rate": 0.00099940961163407, + "loss": 0.97490609, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.33007812, + "step": 234, + "time_per_iteration": 2.901205062866211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01136264, + "balance_loss_mlp": 1.10300505, + "epoch": 0.04520969603693728, + "flos": 511790607360.0, + "grad_norm": 0.061338570366332154, + "language_loss": 0.98733097, + "learning_rate": 0.0009993943795076528, + "loss": 0.99869365, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.33251953, + "step": 235, + "time_per_iteration": 2.6201589107513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132034, + "balance_loss_mlp": 1.09834564, + "epoch": 0.04540207772220085, + "flos": 365058399744.0, + "grad_norm": 0.07983858027410345, + "language_loss": 1.00555849, + "learning_rate": 0.0009993789534991427, + "loss": 1.01687884, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.33691406, + "step": 236, + "time_per_iteration": 2.454946279525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132728, + "balance_loss_mlp": 1.0996356, + "epoch": 0.045594459407464406, + "flos": 522669583872.0, + "grad_norm": 0.08392760705769248, + "language_loss": 0.95816457, + "learning_rate": 0.0009993633336145287, + "loss": 0.96949184, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.33056641, + "step": 237, + "time_per_iteration": 2.6566781997680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128359, + "balance_loss_mlp": 1.09655356, + "epoch": 0.04578684109272797, + "flos": 671776104960.0, + "grad_norm": 0.08042180371297789, + "language_loss": 1.00147879, + "learning_rate": 0.0009993475198598752, + "loss": 1.01276243, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.31811523, + "step": 238, + "time_per_iteration": 3.0513856410980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126034, + "balance_loss_mlp": 1.09301257, + "epoch": 0.04597922277799153, + "flos": 541633277952.0, + "grad_norm": 0.0829568534139584, + "language_loss": 0.95935237, + "learning_rate": 0.0009993315122413212, + "loss": 0.97061276, + "num_input_tokens_seen": 18471920, + 
"router_z_loss_mlp": 0.33007812, + "step": 239, + "time_per_iteration": 2.659076690673828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138389, + "balance_loss_mlp": 1.10458112, + "epoch": 0.0461716044632551, + "flos": 458984102400.0, + "grad_norm": 0.07781318144537454, + "language_loss": 0.96732402, + "learning_rate": 0.0009993153107650818, + "loss": 0.97870797, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.33813477, + "step": 240, + "time_per_iteration": 2.6491312980651855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141869, + "balance_loss_mlp": 1.10593915, + "epoch": 0.04636398614851866, + "flos": 455240922624.0, + "grad_norm": 0.09031233919320754, + "language_loss": 0.95913565, + "learning_rate": 0.0009992989154374468, + "loss": 0.97055435, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.35961914, + "step": 241, + "time_per_iteration": 2.5679047107696533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153823, + "balance_loss_mlp": 1.11829901, + "epoch": 0.046556367833782225, + "flos": 556826254848.0, + "grad_norm": 0.07248816816072506, + "language_loss": 1.03108311, + "learning_rate": 0.0009992823262647817, + "loss": 1.04262137, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.35546875, + "step": 242, + "time_per_iteration": 2.7263669967651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146146, + "balance_loss_mlp": 1.11167073, + "epoch": 0.046748749519045785, + "flos": 592917470208.0, + "grad_norm": 0.08958834607355992, + "language_loss": 0.96952182, + "learning_rate": 0.0009992655432535264, + "loss": 0.98098326, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.3449707, + "step": 243, + "time_per_iteration": 2.7712135314941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156392, + "balance_loss_mlp": 1.12115347, + "epoch": 0.04694113120430935, + "flos": 569864802816.0, + "grad_norm": 
0.06980487860605987, + "language_loss": 0.97863543, + "learning_rate": 0.0009992485664101973, + "loss": 0.99019933, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.35229492, + "step": 244, + "time_per_iteration": 2.7024378776550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164354, + "balance_loss_mlp": 1.12825704, + "epoch": 0.04713351288957291, + "flos": 863768987136.0, + "grad_norm": 0.059856394455686884, + "language_loss": 0.9987036, + "learning_rate": 0.000999231395741385, + "loss": 1.01034713, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.36108398, + "step": 245, + "time_per_iteration": 3.1183571815490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165942, + "balance_loss_mlp": 1.13215792, + "epoch": 0.04732589457483648, + "flos": 537215390208.0, + "grad_norm": 0.0943556711706318, + "language_loss": 0.97312224, + "learning_rate": 0.0009992140312537557, + "loss": 0.98478168, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.33789062, + "step": 246, + "time_per_iteration": 2.6516497135162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144759, + "balance_loss_mlp": 1.11121345, + "epoch": 0.04751827626010004, + "flos": 761906870784.0, + "grad_norm": 0.07660143361567079, + "language_loss": 0.93426013, + "learning_rate": 0.000999196472954051, + "loss": 0.94570768, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.33569336, + "step": 247, + "time_per_iteration": 2.975703477859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.06084414, + "balance_loss_mlp": 5.7578764, + "epoch": 0.0477106579453636, + "flos": 1579791859200.0, + "grad_norm": 0.5887991941215185, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.85509264, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 3.265625, + "step": 248, + "time_per_iteration": 5.566707372665405 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01147412, + "balance_loss_mlp": 1.11617875, + "epoch": 0.04790303963062716, + "flos": 457766111232.0, + "grad_norm": 0.08054908277290292, + "language_loss": 0.99819887, + "learning_rate": 0.0009991607749457578, + "loss": 1.00967312, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.31225586, + "step": 249, + "time_per_iteration": 2.601257801055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179313, + "balance_loss_mlp": 1.14769912, + "epoch": 0.04809542131589073, + "flos": 782419668480.0, + "grad_norm": 0.0802426637388702, + "language_loss": 0.97979879, + "learning_rate": 0.0009991426352510286, + "loss": 0.99159187, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.31591797, + "step": 250, + "time_per_iteration": 3.036884069442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221052, + "balance_loss_mlp": 1.18660045, + "epoch": 0.04828780300115429, + "flos": 559260039168.0, + "grad_norm": 0.10047763480801107, + "language_loss": 0.99211901, + "learning_rate": 0.0009991243017719422, + "loss": 1.00432956, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.34448242, + "step": 251, + "time_per_iteration": 2.6728298664093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221015, + "balance_loss_mlp": 1.18696856, + "epoch": 0.048480184686417856, + "flos": 501929561088.0, + "grad_norm": 0.09158100422304945, + "language_loss": 0.93989825, + "learning_rate": 0.0009991057745156165, + "loss": 0.95210844, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.34033203, + "step": 252, + "time_per_iteration": 2.6554462909698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03874573, + "balance_loss_mlp": 3.65637207, + "epoch": 0.048672566371681415, + "flos": 1536360016896.0, + "grad_norm": 0.4297237142905687, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.85785556, + "num_input_tokens_seen": 
19796368, + "router_z_loss_mlp": 2.1875, + "step": 253, + "time_per_iteration": 5.027901649475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243575, + "balance_loss_mlp": 1.20886147, + "epoch": 0.04886494805694498, + "flos": 537922031616.0, + "grad_norm": 0.13167813172379958, + "language_loss": 1.02751815, + "learning_rate": 0.0009990681387000943, + "loss": 1.03995395, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.34716797, + "step": 254, + "time_per_iteration": 2.830775260925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287984, + "balance_loss_mlp": 1.25019443, + "epoch": 0.04905732974220854, + "flos": 680169540096.0, + "grad_norm": 0.07749867859212424, + "language_loss": 0.9817788, + "learning_rate": 0.0009990490301555093, + "loss": 0.99465859, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.37792969, + "step": 255, + "time_per_iteration": 2.9786195755004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04361559, + "balance_loss_mlp": 4.02739191, + "epoch": 0.04924971142747211, + "flos": 1421179997184.0, + "grad_norm": 0.4777758897592442, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.83576715, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 3.34375, + "step": 256, + "time_per_iteration": 4.883893013000488 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03108787, + "balance_loss_mlp": 2.91576314, + "epoch": 0.04944209311273567, + "flos": 1558006742016.0, + "grad_norm": 0.20418401203526695, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.8235153, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 1.9296875, + "step": 257, + "time_per_iteration": 4.981754541397095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03419801, + "balance_loss_mlp": 3.31070042, + "epoch": 0.04963447479799923, + "flos": 1570820262912.0, + "grad_norm": 
0.4614192090098086, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.73395681, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 1.09375, + "step": 258, + "time_per_iteration": 4.904312372207642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0147617, + "balance_loss_mlp": 1.43730807, + "epoch": 0.049826856483262794, + "flos": 625349569536.0, + "grad_norm": 0.19757670960702672, + "language_loss": 0.92998719, + "learning_rate": 0.0009989706585723202, + "loss": 0.94474888, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.38867188, + "step": 259, + "time_per_iteration": 2.8021159172058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01539233, + "balance_loss_mlp": 1.49808145, + "epoch": 0.05001923816852635, + "flos": 504160713216.0, + "grad_norm": 0.19510301282339976, + "language_loss": 0.99383926, + "learning_rate": 0.0009989505813633442, + "loss": 1.00923157, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.41137695, + "step": 260, + "time_per_iteration": 2.6653668880462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01478791, + "balance_loss_mlp": 1.4348743, + "epoch": 0.05021161985378992, + "flos": 587345831424.0, + "grad_norm": 0.10786879930825251, + "language_loss": 0.98759341, + "learning_rate": 0.000998930310444573, + "loss": 1.00238132, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.43920898, + "step": 261, + "time_per_iteration": 2.7604081630706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432802, + "balance_loss_mlp": 1.38426006, + "epoch": 0.05040400153905348, + "flos": 633303235584.0, + "grad_norm": 0.09058377349206405, + "language_loss": 0.96455801, + "learning_rate": 0.0009989098458238765, + "loss": 0.97888601, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.4855957, + "step": 262, + "time_per_iteration": 2.8061673641204834 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01428574, + "balance_loss_mlp": 1.3737855, + "epoch": 0.050596383224317046, + "flos": 553636270080.0, + "grad_norm": 0.09431506628041801, + "language_loss": 0.959288, + "learning_rate": 0.0009988891875091998, + "loss": 0.9735738, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.54833984, + "step": 263, + "time_per_iteration": 2.756467819213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0142654, + "balance_loss_mlp": 1.36974835, + "epoch": 0.050788764909580605, + "flos": 549663293952.0, + "grad_norm": 0.10391907645657336, + "language_loss": 0.90729272, + "learning_rate": 0.0009988683355085636, + "loss": 0.92155808, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.56787109, + "step": 264, + "time_per_iteration": 2.7685976028442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01420401, + "balance_loss_mlp": 1.3644681, + "epoch": 0.05098114659484417, + "flos": 605118325248.0, + "grad_norm": 0.09802606586789958, + "language_loss": 0.99670649, + "learning_rate": 0.000998847289830063, + "loss": 1.01091051, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.55957031, + "step": 265, + "time_per_iteration": 2.831874132156372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01390772, + "balance_loss_mlp": 1.34082305, + "epoch": 0.05117352828010773, + "flos": 438548027904.0, + "grad_norm": 0.13175698376961376, + "language_loss": 0.92018604, + "learning_rate": 0.0009988260504818682, + "loss": 0.93409377, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.49926758, + "step": 266, + "time_per_iteration": 2.5666043758392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01364075, + "balance_loss_mlp": 1.31720233, + "epoch": 0.0513659099653713, + "flos": 505032910848.0, + "grad_norm": 0.11617121831129276, + "language_loss": 0.98586178, + "learning_rate": 0.000998804617472226, + "loss": 0.99950248, + "num_input_tokens_seen": 21304864, + 
"router_z_loss_mlp": 0.46899414, + "step": 267, + "time_per_iteration": 2.683875322341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339401, + "balance_loss_mlp": 1.29844046, + "epoch": 0.05155829165063486, + "flos": 695488799232.0, + "grad_norm": 0.13482471872787388, + "language_loss": 0.93566334, + "learning_rate": 0.0009987829908094568, + "loss": 0.94905734, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.40966797, + "step": 268, + "time_per_iteration": 2.844641923904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270506, + "balance_loss_mlp": 1.23007023, + "epoch": 0.051750673335898424, + "flos": 1348260111360.0, + "grad_norm": 0.10753691268900553, + "language_loss": 1.00233316, + "learning_rate": 0.0009987611705019569, + "loss": 1.01503825, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.40454102, + "step": 269, + "time_per_iteration": 4.188141107559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223109, + "balance_loss_mlp": 1.1811955, + "epoch": 0.051943055021161984, + "flos": 489607566336.0, + "grad_norm": 0.09459011438584931, + "language_loss": 0.9928273, + "learning_rate": 0.0009987391565581978, + "loss": 1.00505841, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.41943359, + "step": 270, + "time_per_iteration": 2.603743076324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187631, + "balance_loss_mlp": 1.14400077, + "epoch": 0.05213543670642555, + "flos": 545779150848.0, + "grad_norm": 0.06481058483540457, + "language_loss": 0.91893035, + "learning_rate": 0.000998716948986726, + "loss": 0.93080664, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.43652344, + "step": 271, + "time_per_iteration": 2.8780717849731445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189162, + "balance_loss_mlp": 1.14545989, + "epoch": 0.05232781839168911, + "flos": 603561881088.0, + "grad_norm": 
0.0816946734367831, + "language_loss": 0.93787229, + "learning_rate": 0.0009986945477961633, + "loss": 0.94976389, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.4375, + "step": 272, + "time_per_iteration": 2.723017692565918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181133, + "balance_loss_mlp": 1.13828969, + "epoch": 0.052520200076952676, + "flos": 538504962048.0, + "grad_norm": 0.0734282707774283, + "language_loss": 0.99389303, + "learning_rate": 0.0009986719529952066, + "loss": 1.00570428, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.4284668, + "step": 273, + "time_per_iteration": 2.8852784633636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175268, + "balance_loss_mlp": 1.13082659, + "epoch": 0.052712581762216236, + "flos": 463384737792.0, + "grad_norm": 0.10629611668364672, + "language_loss": 0.98564589, + "learning_rate": 0.000998649164592628, + "loss": 0.99739856, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.44458008, + "step": 274, + "time_per_iteration": 2.616504430770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151835, + "balance_loss_mlp": 1.1077987, + "epoch": 0.0529049634474798, + "flos": 548020214784.0, + "grad_norm": 0.10641977070505904, + "language_loss": 0.95747149, + "learning_rate": 0.0009986261825972748, + "loss": 0.96898991, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.44018555, + "step": 275, + "time_per_iteration": 2.7185463905334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170515, + "balance_loss_mlp": 1.12447667, + "epoch": 0.05309734513274336, + "flos": 618021052416.0, + "grad_norm": 0.09271858345864015, + "language_loss": 0.98292786, + "learning_rate": 0.000998603007018069, + "loss": 0.99463308, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.46044922, + "step": 276, + "time_per_iteration": 2.884373188018799 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0120113, + "balance_loss_mlp": 1.15065718, + "epoch": 0.05328972681800693, + "flos": 605498996736.0, + "grad_norm": 0.06824174267425122, + "language_loss": 0.95424223, + "learning_rate": 0.0009985796378640089, + "loss": 0.96625352, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.50512695, + "step": 277, + "time_per_iteration": 2.766671895980835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196508, + "balance_loss_mlp": 1.14670205, + "epoch": 0.05348210850327049, + "flos": 604503088128.0, + "grad_norm": 0.07462742938020851, + "language_loss": 0.95504081, + "learning_rate": 0.0009985560751441665, + "loss": 0.96700585, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.49829102, + "step": 278, + "time_per_iteration": 2.8290188312530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202501, + "balance_loss_mlp": 1.1519084, + "epoch": 0.053674490188534055, + "flos": 630782816256.0, + "grad_norm": 0.08249787624351518, + "language_loss": 0.97367889, + "learning_rate": 0.00099853231886769, + "loss": 0.98570395, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.5065918, + "step": 279, + "time_per_iteration": 2.7985732555389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208136, + "balance_loss_mlp": 1.15880692, + "epoch": 0.053866871873797614, + "flos": 479185611264.0, + "grad_norm": 0.06817333546872655, + "language_loss": 0.98251152, + "learning_rate": 0.0009985083690438024, + "loss": 0.99459285, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.4934082, + "step": 280, + "time_per_iteration": 2.711107015609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120231, + "balance_loss_mlp": 1.15419662, + "epoch": 0.054059253559061174, + "flos": 788035723776.0, + "grad_norm": 0.06285675396315912, + "language_loss": 0.88899338, + "learning_rate": 0.0009984842256818016, + "loss": 0.90101647, + "num_input_tokens_seen": 22370864, 
+ "router_z_loss_mlp": 0.48095703, + "step": 281, + "time_per_iteration": 3.089395761489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118336, + "balance_loss_mlp": 1.13934779, + "epoch": 0.05425163524432474, + "flos": 628361515008.0, + "grad_norm": 0.09184892817545263, + "language_loss": 0.99464393, + "learning_rate": 0.0009984598887910613, + "loss": 1.00647748, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.43994141, + "step": 282, + "time_per_iteration": 2.809372663497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193758, + "balance_loss_mlp": 1.14736223, + "epoch": 0.0544440169295883, + "flos": 615760164864.0, + "grad_norm": 0.0862697219544723, + "language_loss": 0.95099992, + "learning_rate": 0.0009984353583810297, + "loss": 0.96293747, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.46386719, + "step": 283, + "time_per_iteration": 2.887547016143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174794, + "balance_loss_mlp": 1.12997127, + "epoch": 0.05463639861485187, + "flos": 647762406912.0, + "grad_norm": 0.07077343192171563, + "language_loss": 0.96608889, + "learning_rate": 0.0009984106344612302, + "loss": 0.97783673, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.44799805, + "step": 284, + "time_per_iteration": 2.7930290699005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158485, + "balance_loss_mlp": 1.11640382, + "epoch": 0.054828780300115426, + "flos": 797192699904.0, + "grad_norm": 0.07340568947827376, + "language_loss": 0.92955279, + "learning_rate": 0.0009983857170412615, + "loss": 0.94113761, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.42089844, + "step": 285, + "time_per_iteration": 3.0093743801116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165069, + "balance_loss_mlp": 1.1219871, + "epoch": 0.05502116198537899, + "flos": 549690458112.0, + "grad_norm": 
0.05960836075086468, + "language_loss": 0.92676461, + "learning_rate": 0.000998360606130798, + "loss": 0.93841541, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.43041992, + "step": 286, + "time_per_iteration": 2.837170362472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03710432, + "balance_loss_mlp": 3.53495646, + "epoch": 0.05521354367064255, + "flos": 1407753437184.0, + "grad_norm": 0.1985650778679295, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.72783548, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 1.7578125, + "step": 287, + "time_per_iteration": 4.929426908493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157732, + "balance_loss_mlp": 1.11290884, + "epoch": 0.05540592535590612, + "flos": 645420026880.0, + "grad_norm": 0.08938470510968509, + "language_loss": 0.98063755, + "learning_rate": 0.0009983098038774552, + "loss": 0.99221486, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.44799805, + "step": 288, + "time_per_iteration": 2.8677265644073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03445158, + "balance_loss_mlp": 3.31088066, + "epoch": 0.05559830704116968, + "flos": 1511095647744.0, + "grad_norm": 0.2206810579053755, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.81615388, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 1.34375, + "step": 289, + "time_per_iteration": 4.795354604721069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204321, + "balance_loss_mlp": 1.15992737, + "epoch": 0.055790688726433245, + "flos": 508328980992.0, + "grad_norm": 0.08343344919246831, + "language_loss": 0.96212429, + "learning_rate": 0.0009982582277800948, + "loss": 0.97416747, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.44360352, + "step": 290, + "time_per_iteration": 2.610515832901001 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01201009, + "balance_loss_mlp": 1.15659118, + "epoch": 0.055983070411696804, + "flos": 657870501888.0, + "grad_norm": 0.09373610552028779, + "language_loss": 1.02980018, + "learning_rate": 0.0009982321495648908, + "loss": 1.04181027, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.4440918, + "step": 291, + "time_per_iteration": 2.847222089767456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213433, + "balance_loss_mlp": 1.16884899, + "epoch": 0.05617545209696037, + "flos": 587335919616.0, + "grad_norm": 0.12267020035660053, + "language_loss": 0.94884562, + "learning_rate": 0.0009982058779188115, + "loss": 0.96097994, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.44604492, + "step": 292, + "time_per_iteration": 2.7585439682006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190958, + "balance_loss_mlp": 1.14596868, + "epoch": 0.05636783378222393, + "flos": 611621632512.0, + "grad_norm": 0.07287048907504978, + "language_loss": 1.01494539, + "learning_rate": 0.0009981794128520567, + "loss": 1.02685499, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.44970703, + "step": 293, + "time_per_iteration": 2.8542449474334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194856, + "balance_loss_mlp": 1.14817381, + "epoch": 0.0565602154674875, + "flos": 668161405440.0, + "grad_norm": 0.060100997943795566, + "language_loss": 0.98246396, + "learning_rate": 0.000998152754374901, + "loss": 0.99441248, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.46704102, + "step": 294, + "time_per_iteration": 2.897792100906372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183893, + "balance_loss_mlp": 1.13856936, + "epoch": 0.05675259715275106, + "flos": 617242830336.0, + "grad_norm": 0.0698691020933478, + "language_loss": 0.94496101, + "learning_rate": 0.0009981259024976943, + "loss": 0.95679998, + "num_input_tokens_seen": 
23722032, + "router_z_loss_mlp": 0.453125, + "step": 295, + "time_per_iteration": 2.7404842376708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186618, + "balance_loss_mlp": 1.14067447, + "epoch": 0.05694497883801462, + "flos": 751769040384.0, + "grad_norm": 0.10167990029855892, + "language_loss": 0.92340136, + "learning_rate": 0.0009980988572308612, + "loss": 0.93526757, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.45922852, + "step": 296, + "time_per_iteration": 3.007516384124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169257, + "balance_loss_mlp": 1.12450624, + "epoch": 0.05713736052327818, + "flos": 712010995200.0, + "grad_norm": 0.07320939901236567, + "language_loss": 0.95507723, + "learning_rate": 0.0009980716185849015, + "loss": 0.96676981, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.44775391, + "step": 297, + "time_per_iteration": 2.9953107833862305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163884, + "balance_loss_mlp": 1.12180316, + "epoch": 0.05732974220854175, + "flos": 468976200192.0, + "grad_norm": 0.06695295039959538, + "language_loss": 0.92045325, + "learning_rate": 0.0009980441865703904, + "loss": 0.93209207, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.4206543, + "step": 298, + "time_per_iteration": 2.6119296550750732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149691, + "balance_loss_mlp": 1.10896909, + "epoch": 0.05752212389380531, + "flos": 601422133248.0, + "grad_norm": 0.07389257813376128, + "language_loss": 1.00092888, + "learning_rate": 0.000998016561197978, + "loss": 1.0124259, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.40698242, + "step": 299, + "time_per_iteration": 2.776057004928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139916, + "balance_loss_mlp": 1.10072017, + "epoch": 0.057714505579068875, + "flos": 678664852992.0, + "grad_norm": 
0.08850581007108178, + "language_loss": 0.91981971, + "learning_rate": 0.0009979887424783895, + "loss": 0.93121886, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.39208984, + "step": 300, + "time_per_iteration": 2.9253783226013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114416, + "balance_loss_mlp": 1.10362935, + "epoch": 0.057906887264332435, + "flos": 595884999168.0, + "grad_norm": 0.06286570971506464, + "language_loss": 0.91965425, + "learning_rate": 0.0009979607304224248, + "loss": 0.93109584, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.40527344, + "step": 301, + "time_per_iteration": 2.7880210876464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148274, + "balance_loss_mlp": 1.10626435, + "epoch": 0.058099268949596, + "flos": 552116901888.0, + "grad_norm": 0.07282163575611278, + "language_loss": 0.98193479, + "learning_rate": 0.000997932525040959, + "loss": 0.9934175, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.41992188, + "step": 302, + "time_per_iteration": 2.6913211345672607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135024, + "balance_loss_mlp": 1.09647226, + "epoch": 0.05829165063485956, + "flos": 508170765312.0, + "grad_norm": 0.08010118219410382, + "language_loss": 1.00433981, + "learning_rate": 0.000997904126344943, + "loss": 1.01569009, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.38549805, + "step": 303, + "time_per_iteration": 2.648486375808716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152351, + "balance_loss_mlp": 1.112535, + "epoch": 0.05848403232012313, + "flos": 615231562752.0, + "grad_norm": 0.07274054196633538, + "language_loss": 0.95938694, + "learning_rate": 0.0009978755343454018, + "loss": 0.97091049, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.39794922, + "step": 304, + "time_per_iteration": 2.7488231658935547 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01162494, + "balance_loss_mlp": 1.12279713, + "epoch": 0.05867641400538669, + "flos": 500083849728.0, + "grad_norm": 0.07785655900909055, + "language_loss": 0.97099572, + "learning_rate": 0.0009978467490534355, + "loss": 0.98262066, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.39697266, + "step": 305, + "time_per_iteration": 2.5928122997283936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161095, + "balance_loss_mlp": 1.12101698, + "epoch": 0.05886879569065025, + "flos": 531290244096.0, + "grad_norm": 0.06710807116161162, + "language_loss": 0.94506705, + "learning_rate": 0.00099781777048022, + "loss": 0.95667803, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.40087891, + "step": 306, + "time_per_iteration": 2.7071874141693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166562, + "balance_loss_mlp": 1.12727094, + "epoch": 0.05906117737591381, + "flos": 489056569344.0, + "grad_norm": 0.06805578843696672, + "language_loss": 0.95336848, + "learning_rate": 0.0009977885986370057, + "loss": 0.96503407, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.39282227, + "step": 307, + "time_per_iteration": 2.560727119445801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181694, + "balance_loss_mlp": 1.14190209, + "epoch": 0.05925355906117737, + "flos": 591511527936.0, + "grad_norm": 0.07408509854998435, + "language_loss": 0.92084455, + "learning_rate": 0.000997759233535118, + "loss": 0.93266147, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.39770508, + "step": 308, + "time_per_iteration": 2.811706304550171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199188, + "balance_loss_mlp": 1.15813279, + "epoch": 0.05944594074644094, + "flos": 563655532032.0, + "grad_norm": 0.11332767927985109, + "language_loss": 0.97065681, + "learning_rate": 0.0009977296751859576, + "loss": 0.98264867, + "num_input_tokens_seen": 
24735808, + "router_z_loss_mlp": 0.41040039, + "step": 309, + "time_per_iteration": 2.8100500106811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182124, + "balance_loss_mlp": 1.14152098, + "epoch": 0.0596383224317045, + "flos": 538747241472.0, + "grad_norm": 0.06886541031259097, + "language_loss": 0.99580777, + "learning_rate": 0.0009976999236009998, + "loss": 1.00762904, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.40576172, + "step": 310, + "time_per_iteration": 2.7856838703155518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116778, + "balance_loss_mlp": 1.12984788, + "epoch": 0.059830704116968066, + "flos": 560957446656.0, + "grad_norm": 0.07671642451322926, + "language_loss": 1.00938904, + "learning_rate": 0.0009976699787917955, + "loss": 1.02106678, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.37939453, + "step": 311, + "time_per_iteration": 2.679760217666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01912771, + "balance_loss_mlp": 1.87653184, + "epoch": 0.060023085802231625, + "flos": 1570615059456.0, + "grad_norm": 0.11004817833063929, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.75355768, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.36328125, + "step": 312, + "time_per_iteration": 5.035902976989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167025, + "balance_loss_mlp": 1.12742412, + "epoch": 0.06021546748749519, + "flos": 482657149440.0, + "grad_norm": 0.08745367830689305, + "language_loss": 0.92707014, + "learning_rate": 0.0009976095095472243, + "loss": 0.93874037, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.39575195, + "step": 313, + "time_per_iteration": 2.606323480606079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137298, + "balance_loss_mlp": 1.10091519, + "epoch": 0.06040784917275875, + "flos": 620195304960.0, + "grad_norm": 
0.07680079441574393, + "language_loss": 0.94012022, + "learning_rate": 0.0009975789851353334, + "loss": 0.95149314, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.36352539, + "step": 314, + "time_per_iteration": 2.838961362838745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135404, + "balance_loss_mlp": 1.10076201, + "epoch": 0.06060023085802232, + "flos": 483553939968.0, + "grad_norm": 0.07916345547758051, + "language_loss": 0.96821368, + "learning_rate": 0.0009975482675461487, + "loss": 0.97956777, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.34643555, + "step": 315, + "time_per_iteration": 2.6935253143310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122263, + "balance_loss_mlp": 1.08905149, + "epoch": 0.06079261254328588, + "flos": 581892761088.0, + "grad_norm": 0.06025100036286014, + "language_loss": 0.94348001, + "learning_rate": 0.0009975173567915952, + "loss": 0.95470262, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.33203125, + "step": 316, + "time_per_iteration": 2.784148931503296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123821, + "balance_loss_mlp": 1.08903599, + "epoch": 0.060984994228549444, + "flos": 687794664960.0, + "grad_norm": 0.06288070363718151, + "language_loss": 0.8781901, + "learning_rate": 0.000997486252883674, + "loss": 0.88942832, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.34765625, + "step": 317, + "time_per_iteration": 2.8335070610046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130287, + "balance_loss_mlp": 1.09628844, + "epoch": 0.061177375913813004, + "flos": 1314775577088.0, + "grad_norm": 0.08951651385504938, + "language_loss": 0.93891156, + "learning_rate": 0.0009974549558344602, + "loss": 0.95021445, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.33984375, + "step": 318, + "time_per_iteration": 3.661447048187256 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01140147, + "balance_loss_mlp": 1.10564828, + "epoch": 0.06136975759907657, + "flos": 574337018880.0, + "grad_norm": 0.0956877361367619, + "language_loss": 1.0199635, + "learning_rate": 0.000997423465656105, + "loss": 1.03136492, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.34521484, + "step": 319, + "time_per_iteration": 2.7822437286376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124104, + "balance_loss_mlp": 1.08896148, + "epoch": 0.06156213928434013, + "flos": 527537152512.0, + "grad_norm": 0.10289983756800847, + "language_loss": 0.99710345, + "learning_rate": 0.0009973917823608335, + "loss": 1.00834441, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.3515625, + "step": 320, + "time_per_iteration": 2.631345272064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135481, + "balance_loss_mlp": 1.09964669, + "epoch": 0.061754520969603696, + "flos": 495507746304.0, + "grad_norm": 0.0680580088694669, + "language_loss": 0.95663267, + "learning_rate": 0.0009973599059609462, + "loss": 0.96798748, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.35839844, + "step": 321, + "time_per_iteration": 2.7266485691070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117054, + "balance_loss_mlp": 1.13201189, + "epoch": 0.061946902654867256, + "flos": 440079879168.0, + "grad_norm": 0.07460436538347456, + "language_loss": 0.9288404, + "learning_rate": 0.000997327836468819, + "loss": 0.9405458, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.38525391, + "step": 322, + "time_per_iteration": 2.673107385635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179324, + "balance_loss_mlp": 1.14246416, + "epoch": 0.06213928434013082, + "flos": 598800397824.0, + "grad_norm": 0.08768405045584388, + "language_loss": 0.95868701, + "learning_rate": 0.000997295573896902, + "loss": 0.9704802, + "num_input_tokens_seen": 25919424, + 
"router_z_loss_mlp": 0.36865234, + "step": 323, + "time_per_iteration": 2.89715838432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01974818, + "balance_loss_mlp": 1.93609941, + "epoch": 0.06233166602539438, + "flos": 1450135789056.0, + "grad_norm": 0.15129070182137194, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83170861, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.38671875, + "step": 324, + "time_per_iteration": 4.777086496353149 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01725792, + "balance_loss_mlp": 1.68592823, + "epoch": 0.06252404771065795, + "flos": 1463327036928.0, + "grad_norm": 0.07336881302622408, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80297732, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.3984375, + "step": 325, + "time_per_iteration": 4.8852620124816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203893, + "balance_loss_mlp": 1.16684282, + "epoch": 0.06271642939592151, + "flos": 464294011392.0, + "grad_norm": 0.09305137701088195, + "language_loss": 0.90879977, + "learning_rate": 0.000997197627828043, + "loss": 0.92083865, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.37060547, + "step": 326, + "time_per_iteration": 2.615715980529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198178, + "balance_loss_mlp": 1.16174805, + "epoch": 0.06290881108118507, + "flos": 532374985728.0, + "grad_norm": 0.10757754770856821, + "language_loss": 0.86059356, + "learning_rate": 0.0009971645930629716, + "loss": 0.8725754, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.36450195, + "step": 327, + "time_per_iteration": 2.7753512859344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193235, + "balance_loss_mlp": 1.15790117, + "epoch": 0.06310119276644863, + "flos": 673562718720.0, + "grad_norm": 0.0829627430200847, + 
"language_loss": 0.98908973, + "learning_rate": 0.0009971313652814872, + "loss": 1.00102198, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.35351562, + "step": 328, + "time_per_iteration": 2.8697071075439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183904, + "balance_loss_mlp": 1.14957154, + "epoch": 0.0632935744517122, + "flos": 770732734464.0, + "grad_norm": 0.07808148320278054, + "language_loss": 0.9654116, + "learning_rate": 0.0009970979444964903, + "loss": 0.97725058, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.34350586, + "step": 329, + "time_per_iteration": 3.013674259185791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179752, + "balance_loss_mlp": 1.14446568, + "epoch": 0.06348595613697576, + "flos": 561913708032.0, + "grad_norm": 0.08385245466398004, + "language_loss": 0.97686106, + "learning_rate": 0.0009970643307209556, + "loss": 0.98865855, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.3527832, + "step": 330, + "time_per_iteration": 2.868323802947998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168029, + "balance_loss_mlp": 1.13097858, + "epoch": 0.06367833782223932, + "flos": 676189223424.0, + "grad_norm": 0.08206463725837071, + "language_loss": 0.93874633, + "learning_rate": 0.0009970305239679334, + "loss": 0.95042664, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.37060547, + "step": 331, + "time_per_iteration": 2.8225202560424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178202, + "balance_loss_mlp": 1.14210534, + "epoch": 0.06387071950750288, + "flos": 495297773568.0, + "grad_norm": 0.07579712662789459, + "language_loss": 0.98774493, + "learning_rate": 0.0009969965242505483, + "loss": 0.99952692, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.36108398, + "step": 332, + "time_per_iteration": 2.8107545375823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01168447, + "balance_loss_mlp": 1.13325644, + "epoch": 0.06406310119276645, + "flos": 533447244288.0, + "grad_norm": 0.07917811788525977, + "language_loss": 0.94783902, + "learning_rate": 0.0009969623315820007, + "loss": 0.95952344, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.35180664, + "step": 333, + "time_per_iteration": 2.698505401611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171432, + "balance_loss_mlp": 1.13636017, + "epoch": 0.06425548287803001, + "flos": 456184700928.0, + "grad_norm": 0.0763666551446786, + "language_loss": 0.95210952, + "learning_rate": 0.000996927945975565, + "loss": 0.96382385, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.35083008, + "step": 334, + "time_per_iteration": 2.584472894668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115222, + "balance_loss_mlp": 1.11686206, + "epoch": 0.06444786456329357, + "flos": 560077908480.0, + "grad_norm": 0.08033900057819754, + "language_loss": 0.91956127, + "learning_rate": 0.0009968933674445906, + "loss": 0.93108344, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.35375977, + "step": 335, + "time_per_iteration": 2.689556837081909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114559, + "balance_loss_mlp": 1.11109114, + "epoch": 0.06464024624855713, + "flos": 666085897728.0, + "grad_norm": 0.06825993333788044, + "language_loss": 0.94537115, + "learning_rate": 0.0009968585960025028, + "loss": 0.95682704, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.34521484, + "step": 336, + "time_per_iteration": 3.009956121444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02330067, + "balance_loss_mlp": 2.29764199, + "epoch": 0.0648326279338207, + "flos": 1521371870208.0, + "grad_norm": 0.13230953132672904, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.79983252, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 
0.32421875, + "step": 337, + "time_per_iteration": 4.800926685333252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126942, + "balance_loss_mlp": 1.09404051, + "epoch": 0.06502500961908426, + "flos": 1143339909120.0, + "grad_norm": 0.06377137616855041, + "language_loss": 0.92515147, + "learning_rate": 0.0009967884744390583, + "loss": 0.93642092, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.32910156, + "step": 338, + "time_per_iteration": 3.5464487075805664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124603, + "balance_loss_mlp": 1.09043801, + "epoch": 0.06521739130434782, + "flos": 582609314304.0, + "grad_norm": 0.0855348813631026, + "language_loss": 0.93111128, + "learning_rate": 0.0009967531243449256, + "loss": 0.9423573, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.34155273, + "step": 339, + "time_per_iteration": 2.6777007579803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131136, + "balance_loss_mlp": 1.09642255, + "epoch": 0.06540977298961138, + "flos": 497650065408.0, + "grad_norm": 0.07604626819248426, + "language_loss": 1.00833654, + "learning_rate": 0.000996717581394126, + "loss": 1.01964784, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.34741211, + "step": 340, + "time_per_iteration": 2.6667256355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145305, + "balance_loss_mlp": 1.10975671, + "epoch": 0.06560215467487496, + "flos": 542871092736.0, + "grad_norm": 0.07959679456110856, + "language_loss": 1.00992751, + "learning_rate": 0.000996681845600459, + "loss": 1.02138054, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.35571289, + "step": 341, + "time_per_iteration": 2.6750872135162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168019, + "balance_loss_mlp": 1.13158822, + "epoch": 0.06579453636013852, + "flos": 413454357504.0, + "grad_norm": 0.07803079751348088, + "language_loss": 
0.92980075, + "learning_rate": 0.0009966459169777982, + "loss": 0.94148099, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.36425781, + "step": 342, + "time_per_iteration": 2.5240936279296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186775, + "balance_loss_mlp": 1.14920056, + "epoch": 0.06598691804540208, + "flos": 560618993664.0, + "grad_norm": 0.07114695189108672, + "language_loss": 1.02233219, + "learning_rate": 0.0009966097955400924, + "loss": 1.03419995, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.37597656, + "step": 343, + "time_per_iteration": 2.701003313064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182072, + "balance_loss_mlp": 1.14444947, + "epoch": 0.06617929973066564, + "flos": 572090812416.0, + "grad_norm": 0.07450598076984326, + "language_loss": 0.95542282, + "learning_rate": 0.0009965734813013652, + "loss": 0.96724355, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.3762207, + "step": 344, + "time_per_iteration": 2.823782444000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196951, + "balance_loss_mlp": 1.15773153, + "epoch": 0.06637168141592921, + "flos": 490479763968.0, + "grad_norm": 0.0604450427343926, + "language_loss": 0.97975069, + "learning_rate": 0.0009965369742757151, + "loss": 0.9917202, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.39208984, + "step": 345, + "time_per_iteration": 2.5793161392211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222733, + "balance_loss_mlp": 1.18341792, + "epoch": 0.06656406310119277, + "flos": 1079194834944.0, + "grad_norm": 0.07564429768448787, + "language_loss": 0.95189452, + "learning_rate": 0.0009965002744773152, + "loss": 0.96412188, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.39306641, + "step": 346, + "time_per_iteration": 3.5293569564819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225458, + 
"balance_loss_mlp": 1.18573725, + "epoch": 0.06675644478645633, + "flos": 513680735232.0, + "grad_norm": 0.07389332256160373, + "language_loss": 0.91674209, + "learning_rate": 0.0009964633819204139, + "loss": 0.92899668, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.3972168, + "step": 347, + "time_per_iteration": 2.672184705734253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01858873, + "balance_loss_mlp": 1.81805611, + "epoch": 0.06694882647171989, + "flos": 1447192479744.0, + "grad_norm": 0.060031539331637095, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83659983, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.40820312, + "step": 348, + "time_per_iteration": 4.947252988815308 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01829705, + "balance_loss_mlp": 1.78698003, + "epoch": 0.06714120815698346, + "flos": 1552061772288.0, + "grad_norm": 0.05093987002559095, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76983589, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.42773438, + "step": 349, + "time_per_iteration": 4.949065923690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200476, + "balance_loss_mlp": 1.16268659, + "epoch": 0.06733358984224702, + "flos": 880073869824.0, + "grad_norm": 0.10157855165040833, + "language_loss": 0.91835332, + "learning_rate": 0.000996351547842304, + "loss": 0.93035805, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.37792969, + "step": 350, + "time_per_iteration": 3.195343255996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175391, + "balance_loss_mlp": 1.13869905, + "epoch": 0.06752597152751058, + "flos": 518906580480.0, + "grad_norm": 0.09856595883672854, + "language_loss": 0.90272117, + "learning_rate": 0.0009963138843953744, + "loss": 0.91447508, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.3671875, 
+ "step": 351, + "time_per_iteration": 2.6402506828308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141973, + "balance_loss_mlp": 1.10692537, + "epoch": 0.06771835321277414, + "flos": 539668624896.0, + "grad_norm": 0.061148145233813844, + "language_loss": 0.94241744, + "learning_rate": 0.000996276028262306, + "loss": 0.95383716, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.3503418, + "step": 352, + "time_per_iteration": 2.834099769592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112107, + "balance_loss_mlp": 1.08011079, + "epoch": 0.0679107348980377, + "flos": 460666828800.0, + "grad_norm": 0.08429524036953953, + "language_loss": 1.00538242, + "learning_rate": 0.0009962379794577964, + "loss": 1.01650345, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.31982422, + "step": 353, + "time_per_iteration": 2.6607887744903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110445, + "balance_loss_mlp": 1.07780528, + "epoch": 0.06810311658330127, + "flos": 635922026496.0, + "grad_norm": 0.07871401687807635, + "language_loss": 0.91255635, + "learning_rate": 0.000996199737996617, + "loss": 0.92366076, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.32641602, + "step": 354, + "time_per_iteration": 2.977060317993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106148, + "balance_loss_mlp": 1.07484412, + "epoch": 0.06829549826856483, + "flos": 464679452160.0, + "grad_norm": 0.07891213217714192, + "language_loss": 0.99330544, + "learning_rate": 0.0009961613038936149, + "loss": 1.00436699, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.31274414, + "step": 355, + "time_per_iteration": 2.615016222000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097974, + "balance_loss_mlp": 1.06619298, + "epoch": 0.06848787995382839, + "flos": 634647135744.0, + "grad_norm": 0.06589791904701883, + "language_loss": 0.92011106, + 
"learning_rate": 0.000996122677163711, + "loss": 0.93109083, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.31762695, + "step": 356, + "time_per_iteration": 2.844289541244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110581, + "balance_loss_mlp": 1.07848942, + "epoch": 0.06868026163909195, + "flos": 806374268928.0, + "grad_norm": 0.09636291923958067, + "language_loss": 0.97709715, + "learning_rate": 0.000996083857821902, + "loss": 0.98820293, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.32080078, + "step": 357, + "time_per_iteration": 3.0474655628204346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137059, + "balance_loss_mlp": 1.10334635, + "epoch": 0.06887264332435553, + "flos": 439227505152.0, + "grad_norm": 0.09472058747565097, + "language_loss": 0.95954913, + "learning_rate": 0.0009960448458832588, + "loss": 0.97091973, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.3371582, + "step": 358, + "time_per_iteration": 2.7681682109832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153475, + "balance_loss_mlp": 1.12002492, + "epoch": 0.06906502500961909, + "flos": 484767161856.0, + "grad_norm": 0.10342324791005938, + "language_loss": 0.95369232, + "learning_rate": 0.000996005641362927, + "loss": 0.96522713, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.33447266, + "step": 359, + "time_per_iteration": 2.6423869132995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189921, + "balance_loss_mlp": 1.15472996, + "epoch": 0.06925740669488265, + "flos": 733611105792.0, + "grad_norm": 0.10829219970600838, + "language_loss": 0.98827034, + "learning_rate": 0.0009959662442761274, + "loss": 1.00016952, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.35205078, + "step": 360, + "time_per_iteration": 2.941234827041626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185036, + "balance_loss_mlp": 
1.14810538, + "epoch": 0.0694497883801462, + "flos": 552415707648.0, + "grad_norm": 0.0683919199988589, + "language_loss": 0.92245018, + "learning_rate": 0.000995926654638155, + "loss": 0.9343006, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.36938477, + "step": 361, + "time_per_iteration": 2.837684154510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202565, + "balance_loss_mlp": 1.16482282, + "epoch": 0.06964217006540978, + "flos": 678015111168.0, + "grad_norm": 0.0951215156771631, + "language_loss": 0.9350909, + "learning_rate": 0.00099588687246438, + "loss": 0.94711655, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.37719727, + "step": 362, + "time_per_iteration": 2.9100425243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203748, + "balance_loss_mlp": 1.16460001, + "epoch": 0.06983455175067334, + "flos": 524241082368.0, + "grad_norm": 0.11257096193086513, + "language_loss": 1.01560402, + "learning_rate": 0.0009958468977702471, + "loss": 1.02764153, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.39160156, + "step": 363, + "time_per_iteration": 2.6317808628082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01643136, + "balance_loss_mlp": 1.57790494, + "epoch": 0.0700269334359369, + "flos": 1576787254272.0, + "grad_norm": 0.0741524690412032, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81377846, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.65234375, + "step": 364, + "time_per_iteration": 4.827174663543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187013, + "balance_loss_mlp": 1.15229964, + "epoch": 0.07021931512120046, + "flos": 1013248839168.0, + "grad_norm": 0.07557653682461403, + "language_loss": 0.89914072, + "learning_rate": 0.0009957663708830612, + "loss": 0.91101086, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.34741211, + "step": 365, + 
"time_per_iteration": 3.280808448791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201016, + "balance_loss_mlp": 1.16401315, + "epoch": 0.07041169680646403, + "flos": 822983099904.0, + "grad_norm": 0.11033601827195522, + "language_loss": 0.91889954, + "learning_rate": 0.0009957258187212714, + "loss": 0.93090969, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.36987305, + "step": 366, + "time_per_iteration": 3.0436058044433594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01494507, + "balance_loss_mlp": 1.43309093, + "epoch": 0.07060407849172759, + "flos": 1414392938496.0, + "grad_norm": 0.06331255113068197, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80689365, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.61328125, + "step": 367, + "time_per_iteration": 4.807323694229126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209897, + "balance_loss_mlp": 1.17287028, + "epoch": 0.07079646017699115, + "flos": 512909853696.0, + "grad_norm": 0.07799784999620897, + "language_loss": 0.8953917, + "learning_rate": 0.0009956441370400167, + "loss": 0.90749061, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.37036133, + "step": 368, + "time_per_iteration": 2.678028106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218753, + "balance_loss_mlp": 1.18270361, + "epoch": 0.07098884186225471, + "flos": 540501548544.0, + "grad_norm": 0.11766553351136624, + "language_loss": 0.9529528, + "learning_rate": 0.0009956030075522636, + "loss": 0.96514034, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.3605957, + "step": 369, + "time_per_iteration": 2.7700181007385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195721, + "balance_loss_mlp": 1.16050696, + "epoch": 0.07118122354751828, + "flos": 548682439680.0, + "grad_norm": 0.07977968738165528, + "language_loss": 0.95944411, + 
"learning_rate": 0.0009955616856543587, + "loss": 0.97140133, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.35205078, + "step": 370, + "time_per_iteration": 2.6467819213867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011765, + "balance_loss_mlp": 1.14142823, + "epoch": 0.07137360523278184, + "flos": 620904517632.0, + "grad_norm": 0.07610844541313569, + "language_loss": 0.88427055, + "learning_rate": 0.0009955201713623448, + "loss": 0.89603543, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.35083008, + "step": 371, + "time_per_iteration": 2.8926849365234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262059, + "balance_loss_mlp": 1.21208656, + "epoch": 0.0715659869180454, + "flos": 1502672477184.0, + "grad_norm": 0.04749961224137468, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.77934778, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.5, + "step": 372, + "time_per_iteration": 4.978249549865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137568, + "balance_loss_mlp": 1.10769427, + "epoch": 0.07175836860330896, + "flos": 495493065216.0, + "grad_norm": 0.10296972579398556, + "language_loss": 1.02312946, + "learning_rate": 0.0009954365656605333, + "loss": 1.03450513, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.29882812, + "step": 373, + "time_per_iteration": 2.5930416584014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163699, + "balance_loss_mlp": 1.1306777, + "epoch": 0.07195075028857253, + "flos": 785725650432.0, + "grad_norm": 0.08216565506059122, + "language_loss": 0.94662046, + "learning_rate": 0.0009953944742831947, + "loss": 0.95825744, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.33007812, + "step": 374, + "time_per_iteration": 3.02325701713562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149175, + "balance_loss_mlp": 1.1185143, + 
"epoch": 0.0721431319738361, + "flos": 593107619328.0, + "grad_norm": 0.11719346683047478, + "language_loss": 0.98373723, + "learning_rate": 0.0009953521905766642, + "loss": 0.99522901, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.30639648, + "step": 375, + "time_per_iteration": 2.972064733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156803, + "balance_loss_mlp": 1.12435448, + "epoch": 0.07233551365909965, + "flos": 548250011136.0, + "grad_norm": 0.06602159555114745, + "language_loss": 0.97082627, + "learning_rate": 0.0009953097145573577, + "loss": 0.98239434, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.32446289, + "step": 376, + "time_per_iteration": 2.6647017002105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183036, + "balance_loss_mlp": 1.14922833, + "epoch": 0.07252789534436321, + "flos": 957568780800.0, + "grad_norm": 0.0696983564537716, + "language_loss": 0.94069874, + "learning_rate": 0.000995267046241766, + "loss": 0.95252913, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.33837891, + "step": 377, + "time_per_iteration": 3.2699291706085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186549, + "balance_loss_mlp": 1.15281284, + "epoch": 0.07272027702962677, + "flos": 507649503744.0, + "grad_norm": 0.08226328739164854, + "language_loss": 0.94401312, + "learning_rate": 0.0009952241856464547, + "loss": 0.95587862, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.33764648, + "step": 378, + "time_per_iteration": 2.6432976722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191312, + "balance_loss_mlp": 1.15698004, + "epoch": 0.07291265871489035, + "flos": 612412337664.0, + "grad_norm": 0.12013480935274141, + "language_loss": 1.00853705, + "learning_rate": 0.0009951811327880632, + "loss": 1.02045012, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.34350586, + "step": 379, + 
"time_per_iteration": 2.822204828262329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192206, + "balance_loss_mlp": 1.15858889, + "epoch": 0.0731050404001539, + "flos": 495750025728.0, + "grad_norm": 0.08341634879178654, + "language_loss": 0.94250029, + "learning_rate": 0.0009951378876833063, + "loss": 0.95442235, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.3359375, + "step": 380, + "time_per_iteration": 2.630469799041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198017, + "balance_loss_mlp": 1.16311216, + "epoch": 0.07329742208541747, + "flos": 639966956544.0, + "grad_norm": 0.09052350379823415, + "language_loss": 1.00640893, + "learning_rate": 0.0009950944503489736, + "loss": 1.01838911, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.34936523, + "step": 381, + "time_per_iteration": 2.758171796798706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202881, + "balance_loss_mlp": 1.16811991, + "epoch": 0.07348980377068103, + "flos": 816346543104.0, + "grad_norm": 0.08361033479665086, + "language_loss": 0.95423895, + "learning_rate": 0.0009950508208019285, + "loss": 0.96626776, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.34741211, + "step": 382, + "time_per_iteration": 2.9980571269989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187801, + "balance_loss_mlp": 1.15489948, + "epoch": 0.0736821854559446, + "flos": 508640269824.0, + "grad_norm": 0.06841783055573346, + "language_loss": 0.99123466, + "learning_rate": 0.0009950069990591096, + "loss": 1.00311255, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.32910156, + "step": 383, + "time_per_iteration": 2.723439931869507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01573707, + "balance_loss_mlp": 1.54185438, + "epoch": 0.07387456714120816, + "flos": 1554648629760.0, + "grad_norm": 0.1397468631511101, + "language_loss": 0.76401371, + "learning_rate": 
0.0009949629851375302, + "loss": 0.77975076, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.31835938, + "step": 384, + "time_per_iteration": 4.962388753890991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189426, + "balance_loss_mlp": 1.15592778, + "epoch": 0.07406694882647172, + "flos": 525503490048.0, + "grad_norm": 0.0845037323153299, + "language_loss": 0.92480063, + "learning_rate": 0.0009949187790542777, + "loss": 0.93669498, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.33496094, + "step": 385, + "time_per_iteration": 2.7766611576080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193494, + "balance_loss_mlp": 1.16052091, + "epoch": 0.07425933051173528, + "flos": 497738898432.0, + "grad_norm": 0.0971687641338208, + "language_loss": 0.884184, + "learning_rate": 0.0009948743808265148, + "loss": 0.89611894, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.32983398, + "step": 386, + "time_per_iteration": 2.674055576324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183765, + "balance_loss_mlp": 1.15150666, + "epoch": 0.07445171219699885, + "flos": 505003175424.0, + "grad_norm": 0.07384542423184925, + "language_loss": 0.97962248, + "learning_rate": 0.0009948297904714782, + "loss": 0.9914602, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.32250977, + "step": 387, + "time_per_iteration": 2.698899745941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179922, + "balance_loss_mlp": 1.14680552, + "epoch": 0.07464409388226241, + "flos": 553977294336.0, + "grad_norm": 0.06832562007069648, + "language_loss": 0.90421599, + "learning_rate": 0.0009947850080064796, + "loss": 0.91601527, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.33105469, + "step": 388, + "time_per_iteration": 2.8182406425476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178557, + "balance_loss_mlp": 1.14639437, + "epoch": 
0.07483647556752597, + "flos": 776862710784.0, + "grad_norm": 0.06958908790939329, + "language_loss": 0.94972193, + "learning_rate": 0.0009947400334489047, + "loss": 0.96150756, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.3215332, + "step": 389, + "time_per_iteration": 3.0191807746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180436, + "balance_loss_mlp": 1.14767742, + "epoch": 0.07502885725278953, + "flos": 612540817920.0, + "grad_norm": 0.0847539772518024, + "language_loss": 0.86555678, + "learning_rate": 0.0009946948668162145, + "loss": 0.87736106, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.32763672, + "step": 390, + "time_per_iteration": 2.7670745849609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182484, + "balance_loss_mlp": 1.14886689, + "epoch": 0.0752212389380531, + "flos": 688629786624.0, + "grad_norm": 0.08648624436703037, + "language_loss": 0.91666478, + "learning_rate": 0.0009946495081259441, + "loss": 0.92848963, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.33618164, + "step": 391, + "time_per_iteration": 2.8355910778045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168167, + "balance_loss_mlp": 1.13454986, + "epoch": 0.07541362062331666, + "flos": 765699609600.0, + "grad_norm": 0.09254550247646448, + "language_loss": 0.94977629, + "learning_rate": 0.0009946039573957035, + "loss": 0.96145797, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.33618164, + "step": 392, + "time_per_iteration": 2.9576094150543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143715, + "balance_loss_mlp": 1.11300731, + "epoch": 0.07560600230858022, + "flos": 588749202432.0, + "grad_norm": 0.06908129255101257, + "language_loss": 0.91881704, + "learning_rate": 0.000994558214643177, + "loss": 0.93025422, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.30712891, + "step": 393, + "time_per_iteration": 
2.757168769836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141782, + "balance_loss_mlp": 1.11102629, + "epoch": 0.07579838399384378, + "flos": 749834496000.0, + "grad_norm": 0.06274973991827922, + "language_loss": 0.93209511, + "learning_rate": 0.000994512279886123, + "loss": 0.94351292, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.30712891, + "step": 394, + "time_per_iteration": 3.1078224182128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134727, + "balance_loss_mlp": 1.10523462, + "epoch": 0.07599076567910736, + "flos": 523457717760.0, + "grad_norm": 0.07515736533799398, + "language_loss": 0.93902445, + "learning_rate": 0.0009944661531423758, + "loss": 0.95037174, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.29492188, + "step": 395, + "time_per_iteration": 2.6783392429351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149746, + "balance_loss_mlp": 1.12061143, + "epoch": 0.07618314736437092, + "flos": 551086488576.0, + "grad_norm": 0.07362715907626581, + "language_loss": 0.91989446, + "learning_rate": 0.000994419834429843, + "loss": 0.93139195, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.29125977, + "step": 396, + "time_per_iteration": 2.6774208545684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138515, + "balance_loss_mlp": 1.10887921, + "epoch": 0.07637552904963447, + "flos": 698206708224.0, + "grad_norm": 0.0979297809656427, + "language_loss": 0.95834494, + "learning_rate": 0.0009943733237665069, + "loss": 0.96973014, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.29663086, + "step": 397, + "time_per_iteration": 2.8543148040771484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162601, + "balance_loss_mlp": 1.13260818, + "epoch": 0.07656791073489803, + "flos": 579379682304.0, + "grad_norm": 0.07305506526715269, + "language_loss": 0.95531559, + "learning_rate": 
0.0009943266211704248, + "loss": 0.96694154, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.29956055, + "step": 398, + "time_per_iteration": 2.9546711444854736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155699, + "balance_loss_mlp": 1.12427545, + "epoch": 0.0767602924201616, + "flos": 417145780224.0, + "grad_norm": 0.0773299202354709, + "language_loss": 0.97448099, + "learning_rate": 0.000994279726659728, + "loss": 0.98603797, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.31396484, + "step": 399, + "time_per_iteration": 2.51406192779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178721, + "balance_loss_mlp": 1.14610541, + "epoch": 0.07695267410542517, + "flos": 482914109952.0, + "grad_norm": 0.0761918911056457, + "language_loss": 0.9424448, + "learning_rate": 0.0009942326402526231, + "loss": 0.95423204, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.32617188, + "step": 400, + "time_per_iteration": 2.578338146209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175813, + "balance_loss_mlp": 1.14300704, + "epoch": 0.07714505579068873, + "flos": 530998778880.0, + "grad_norm": 0.0730936916243032, + "language_loss": 0.93335903, + "learning_rate": 0.0009941853619673902, + "loss": 0.94511712, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.328125, + "step": 401, + "time_per_iteration": 2.6568922996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175231, + "balance_loss_mlp": 1.14356887, + "epoch": 0.07733743747595229, + "flos": 804995490816.0, + "grad_norm": 0.0850905540992329, + "language_loss": 0.95842957, + "learning_rate": 0.0009941378918223844, + "loss": 0.97018182, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.31616211, + "step": 402, + "time_per_iteration": 3.098615884780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204282, + "balance_loss_mlp": 1.17018807, + "epoch": 
0.07752981916121585, + "flos": 622476016128.0, + "grad_norm": 0.09392353942632323, + "language_loss": 0.9004057, + "learning_rate": 0.0009940902298360354, + "loss": 0.91244853, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.34130859, + "step": 403, + "time_per_iteration": 2.769843101501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188661, + "balance_loss_mlp": 1.15563989, + "epoch": 0.07772220084647942, + "flos": 728276603904.0, + "grad_norm": 0.0817674600565604, + "language_loss": 0.98311555, + "learning_rate": 0.0009940423760268473, + "loss": 0.99500215, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.33007812, + "step": 404, + "time_per_iteration": 2.8945982456207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187921, + "balance_loss_mlp": 1.15442348, + "epoch": 0.07791458253174298, + "flos": 555412972032.0, + "grad_norm": 0.0859899885976376, + "language_loss": 0.92015374, + "learning_rate": 0.0009939943304133982, + "loss": 0.93203294, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.33496094, + "step": 405, + "time_per_iteration": 2.649461269378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172828, + "balance_loss_mlp": 1.14228618, + "epoch": 0.07810696421700654, + "flos": 553181819904.0, + "grad_norm": 0.07444114263212052, + "language_loss": 0.99398023, + "learning_rate": 0.0009939460930143416, + "loss": 1.00570846, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.30517578, + "step": 406, + "time_per_iteration": 2.667829990386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181986, + "balance_loss_mlp": 1.15091991, + "epoch": 0.0782993459022701, + "flos": 650633389056.0, + "grad_norm": 0.08442820151342731, + "language_loss": 0.93529546, + "learning_rate": 0.0009938976638484043, + "loss": 0.9471153, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.31054688, + "step": 407, + "time_per_iteration": 
2.9581079483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184799, + "balance_loss_mlp": 1.15428162, + "epoch": 0.07849172758753367, + "flos": 496172542464.0, + "grad_norm": 0.08907940163556441, + "language_loss": 0.91453135, + "learning_rate": 0.0009938490429343887, + "loss": 0.92637932, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.30493164, + "step": 408, + "time_per_iteration": 2.6066792011260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198916, + "balance_loss_mlp": 1.16708684, + "epoch": 0.07868410927279723, + "flos": 577971542016.0, + "grad_norm": 0.09407218950155852, + "language_loss": 0.92654747, + "learning_rate": 0.0009938002302911709, + "loss": 0.93853664, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.31835938, + "step": 409, + "time_per_iteration": 2.7762253284454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206718, + "balance_loss_mlp": 1.17415047, + "epoch": 0.07887649095806079, + "flos": 522970960896.0, + "grad_norm": 0.10932104394797525, + "language_loss": 0.95012206, + "learning_rate": 0.0009937512259377015, + "loss": 0.96218926, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.32543945, + "step": 410, + "time_per_iteration": 2.7103991508483887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265174, + "balance_loss_mlp": 1.23193812, + "epoch": 0.07906887264332435, + "flos": 557253540864.0, + "grad_norm": 0.08720536696991275, + "language_loss": 0.94637173, + "learning_rate": 0.000993702029893006, + "loss": 0.95902348, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.33251953, + "step": 411, + "time_per_iteration": 2.78560733795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295676, + "balance_loss_mlp": 1.26029515, + "epoch": 0.07926125432858792, + "flos": 821984993280.0, + "grad_norm": 0.11720891364975168, + "language_loss": 0.93816972, + "learning_rate": 
0.0009936526421761838, + "loss": 0.95112646, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.35400391, + "step": 412, + "time_per_iteration": 3.049868583679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128071, + "balance_loss_mlp": 1.24611533, + "epoch": 0.07945363601385148, + "flos": 562336224768.0, + "grad_norm": 0.095587468789244, + "language_loss": 0.96658343, + "learning_rate": 0.000993603062806409, + "loss": 0.9793905, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.34619141, + "step": 413, + "time_per_iteration": 2.6881110668182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262528, + "balance_loss_mlp": 1.22843432, + "epoch": 0.07964601769911504, + "flos": 517868826624.0, + "grad_norm": 0.10701391534122558, + "language_loss": 0.98645592, + "learning_rate": 0.0009935532918029298, + "loss": 0.99908125, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.34082031, + "step": 414, + "time_per_iteration": 2.6234540939331055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253352, + "balance_loss_mlp": 1.21847153, + "epoch": 0.0798383993843786, + "flos": 539224086528.0, + "grad_norm": 0.10153449079868698, + "language_loss": 0.92723763, + "learning_rate": 0.0009935033291850694, + "loss": 0.93977106, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.34887695, + "step": 415, + "time_per_iteration": 2.6565287113189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224774, + "balance_loss_mlp": 1.19258738, + "epoch": 0.08003078106964218, + "flos": 485145262080.0, + "grad_norm": 0.09081981361814888, + "language_loss": 0.94647777, + "learning_rate": 0.0009934531749722247, + "loss": 0.95872557, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.32177734, + "step": 416, + "time_per_iteration": 2.6504123210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214952, + "balance_loss_mlp": 1.18243122, + "epoch": 
0.08022316275490574, + "flos": 518254267392.0, + "grad_norm": 0.08798076505254328, + "language_loss": 0.92810607, + "learning_rate": 0.0009934028291838672, + "loss": 0.94025552, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.32495117, + "step": 417, + "time_per_iteration": 2.7589621543884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202827, + "balance_loss_mlp": 1.17018712, + "epoch": 0.0804155444401693, + "flos": 494012971008.0, + "grad_norm": 0.08954115452054644, + "language_loss": 0.88617092, + "learning_rate": 0.0009933522918395433, + "loss": 0.8981992, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.32592773, + "step": 418, + "time_per_iteration": 2.713758707046509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01389029, + "balance_loss_mlp": 1.35851097, + "epoch": 0.08060792612543285, + "flos": 1581422455296.0, + "grad_norm": 0.08425204298586858, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.79640126, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.3046875, + "step": 419, + "time_per_iteration": 4.9331464767456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218909, + "balance_loss_mlp": 1.18479085, + "epoch": 0.08080030781069643, + "flos": 525343076352.0, + "grad_norm": 0.11622805941353512, + "language_loss": 1.05362594, + "learning_rate": 0.000993250642561551, + "loss": 1.06581497, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.34106445, + "step": 420, + "time_per_iteration": 2.672421455383301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181293, + "balance_loss_mlp": 1.14843905, + "epoch": 0.08099268949595999, + "flos": 546718159872.0, + "grad_norm": 0.10269562775159036, + "language_loss": 0.92318636, + "learning_rate": 0.0009931995306673466, + "loss": 0.93499923, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.32861328, + "step": 421, + "time_per_iteration": 
2.7427923679351807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168977, + "balance_loss_mlp": 1.13657558, + "epoch": 0.08118507118122355, + "flos": 510367412736.0, + "grad_norm": 0.11431346275656909, + "language_loss": 0.97376955, + "learning_rate": 0.000993148227296103, + "loss": 0.98545933, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.32397461, + "step": 422, + "time_per_iteration": 2.675947666168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151197, + "balance_loss_mlp": 1.12122786, + "epoch": 0.08137745286648711, + "flos": 720671302656.0, + "grad_norm": 0.0890704687176176, + "language_loss": 0.860506, + "learning_rate": 0.000993096732467738, + "loss": 0.87201798, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.29956055, + "step": 423, + "time_per_iteration": 3.060911178588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184994, + "balance_loss_mlp": 1.15089989, + "epoch": 0.08156983455175067, + "flos": 679613773824.0, + "grad_norm": 0.1141297149032614, + "language_loss": 0.91752422, + "learning_rate": 0.0009930450462022435, + "loss": 0.9293741, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.34106445, + "step": 424, + "time_per_iteration": 2.8769121170043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233579, + "balance_loss_mlp": 1.20020068, + "epoch": 0.08176221623701424, + "flos": 1453377157632.0, + "grad_norm": 0.046425192010764525, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80423385, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.33398438, + "step": 425, + "time_per_iteration": 4.897430896759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206307, + "balance_loss_mlp": 1.17078233, + "epoch": 0.0819545979222778, + "flos": 1556602292736.0, + "grad_norm": 0.08757679589662427, + "language_loss": 0.89939743, + "learning_rate": 0.0009929410994402065, 
+ "loss": 0.91146052, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.35522461, + "step": 426, + "time_per_iteration": 3.8015847206115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247446, + "balance_loss_mlp": 1.21072912, + "epoch": 0.08214697960754136, + "flos": 512724473856.0, + "grad_norm": 0.09694830127406533, + "language_loss": 0.94969749, + "learning_rate": 0.0009928888389840196, + "loss": 0.96217191, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.3671875, + "step": 427, + "time_per_iteration": 2.7042434215545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244821, + "balance_loss_mlp": 1.21010745, + "epoch": 0.08233936129280492, + "flos": 595124029440.0, + "grad_norm": 0.09892511285502391, + "language_loss": 0.97471511, + "learning_rate": 0.0009928363871714147, + "loss": 0.98716331, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.34716797, + "step": 428, + "time_per_iteration": 2.6848952770233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01253097, + "balance_loss_mlp": 1.21733463, + "epoch": 0.08253174297806849, + "flos": 572039055360.0, + "grad_norm": 0.08269527052289877, + "language_loss": 0.91760862, + "learning_rate": 0.0009927837440227556, + "loss": 0.9301396, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.35766602, + "step": 429, + "time_per_iteration": 2.838052749633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215238, + "balance_loss_mlp": 1.18357563, + "epoch": 0.08272412466333205, + "flos": 623380147200.0, + "grad_norm": 0.07794556654442977, + "language_loss": 0.88257664, + "learning_rate": 0.0009927309095584798, + "loss": 0.89472902, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.31640625, + "step": 430, + "time_per_iteration": 3.010039806365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212644, + "balance_loss_mlp": 1.18246055, + "epoch": 
0.08291650634859561, + "flos": 513994595328.0, + "grad_norm": 0.10632891775269031, + "language_loss": 0.96743113, + "learning_rate": 0.0009926778837991, + "loss": 0.97955757, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.30175781, + "step": 431, + "time_per_iteration": 2.734591484069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182523, + "balance_loss_mlp": 1.15226734, + "epoch": 0.08310888803385917, + "flos": 667365931008.0, + "grad_norm": 0.09435654496071201, + "language_loss": 0.9613564, + "learning_rate": 0.000992624666765202, + "loss": 0.97318161, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.30249023, + "step": 432, + "time_per_iteration": 2.829540252685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164349, + "balance_loss_mlp": 1.13523841, + "epoch": 0.08330126971912274, + "flos": 583293560832.0, + "grad_norm": 0.09286672234440549, + "language_loss": 0.93021452, + "learning_rate": 0.000992571258477447, + "loss": 0.94185793, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.29101562, + "step": 433, + "time_per_iteration": 2.8295536041259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154694, + "balance_loss_mlp": 1.12720466, + "epoch": 0.0834936514043863, + "flos": 561350227968.0, + "grad_norm": 0.10037104501236055, + "language_loss": 0.88638759, + "learning_rate": 0.0009925176589565695, + "loss": 0.89793456, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.27514648, + "step": 434, + "time_per_iteration": 2.8025705814361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164119, + "balance_loss_mlp": 1.13445985, + "epoch": 0.08368603308964986, + "flos": 494519551488.0, + "grad_norm": 0.1154039733497609, + "language_loss": 0.97325677, + "learning_rate": 0.0009924638682233791, + "loss": 0.98489797, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.29663086, + "step": 435, + "time_per_iteration": 
2.576300621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175635, + "balance_loss_mlp": 1.14626217, + "epoch": 0.08387841477491342, + "flos": 1389017714688.0, + "grad_norm": 0.058007479940938765, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80740231, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.29296875, + "step": 436, + "time_per_iteration": 4.615980625152588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203285, + "balance_loss_mlp": 1.17262459, + "epoch": 0.084070796460177, + "flos": 798984082944.0, + "grad_norm": 0.10734010742427191, + "language_loss": 0.87080061, + "learning_rate": 0.0009923557132036668, + "loss": 0.88283348, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.30664062, + "step": 437, + "time_per_iteration": 3.098910331726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203826, + "balance_loss_mlp": 1.1721158, + "epoch": 0.08426317814544056, + "flos": 558963431424.0, + "grad_norm": 0.10713326361470918, + "language_loss": 0.92728174, + "learning_rate": 0.0009923013489591345, + "loss": 0.93932003, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.31713867, + "step": 438, + "time_per_iteration": 2.7423956394195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198894, + "balance_loss_mlp": 1.16902053, + "epoch": 0.08445555983070412, + "flos": 810421396992.0, + "grad_norm": 0.10035753440716286, + "language_loss": 0.90567303, + "learning_rate": 0.0009922467935862681, + "loss": 0.91766196, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.29882812, + "step": 439, + "time_per_iteration": 3.1101534366607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205509, + "balance_loss_mlp": 1.17477679, + "epoch": 0.08464794151596768, + "flos": 510184604160.0, + "grad_norm": 0.11954514685823285, + "language_loss": 0.93942809, + "learning_rate": 
0.0009921920471062478, + "loss": 0.95148319, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.30712891, + "step": 440, + "time_per_iteration": 2.600698947906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120422, + "balance_loss_mlp": 1.1727252, + "epoch": 0.08484032320123125, + "flos": 556413649920.0, + "grad_norm": 0.09556707126690236, + "language_loss": 0.90983319, + "learning_rate": 0.0009921371095403281, + "loss": 0.92187542, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.31518555, + "step": 441, + "time_per_iteration": 2.6733319759368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223638, + "balance_loss_mlp": 1.19223797, + "epoch": 0.08503270488649481, + "flos": 527354343936.0, + "grad_norm": 0.07797244609311368, + "language_loss": 0.93788469, + "learning_rate": 0.0009920819809098379, + "loss": 0.95012105, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.3137207, + "step": 442, + "time_per_iteration": 2.6183252334594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225883, + "balance_loss_mlp": 1.1949122, + "epoch": 0.08522508657175837, + "flos": 614267960832.0, + "grad_norm": 0.09885461493176821, + "language_loss": 0.89838576, + "learning_rate": 0.0009920266612361798, + "loss": 0.91064465, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.30957031, + "step": 443, + "time_per_iteration": 2.8172709941864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226855, + "balance_loss_mlp": 1.19721913, + "epoch": 0.08541746825702193, + "flos": 619792611840.0, + "grad_norm": 0.0888891387256682, + "language_loss": 0.90358502, + "learning_rate": 0.0009919711505408308, + "loss": 0.91585356, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.29614258, + "step": 444, + "time_per_iteration": 2.8260107040405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210635, + "balance_loss_mlp": 1.17978323, + 
"epoch": 0.08560984994228549, + "flos": 482914109952.0, + "grad_norm": 0.08298354336382399, + "language_loss": 0.88123727, + "learning_rate": 0.000991915448845342, + "loss": 0.89334357, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.30810547, + "step": 445, + "time_per_iteration": 2.5825653076171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189896, + "balance_loss_mlp": 1.16185772, + "epoch": 0.08580223162754906, + "flos": 517152273408.0, + "grad_norm": 0.079307281997536, + "language_loss": 0.97017783, + "learning_rate": 0.000991859556171339, + "loss": 0.98207676, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.28027344, + "step": 446, + "time_per_iteration": 2.60908579826355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169669, + "balance_loss_mlp": 1.14200044, + "epoch": 0.08599461331281262, + "flos": 531475623936.0, + "grad_norm": 0.12218297197997938, + "language_loss": 0.98194999, + "learning_rate": 0.000991803472540521, + "loss": 0.99364674, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.27648926, + "step": 447, + "time_per_iteration": 2.6712088584899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151986, + "balance_loss_mlp": 1.12646365, + "epoch": 0.08618699499807618, + "flos": 790299182592.0, + "grad_norm": 0.09227172547062512, + "language_loss": 0.94125748, + "learning_rate": 0.0009917471979746615, + "loss": 0.95277739, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.25549316, + "step": 448, + "time_per_iteration": 3.075975179672241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168815, + "balance_loss_mlp": 1.1426959, + "epoch": 0.08637937668333974, + "flos": 565988000256.0, + "grad_norm": 0.08941799521583026, + "language_loss": 0.93856514, + "learning_rate": 0.0009916907324956086, + "loss": 0.95025325, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.26123047, + "step": 449, + 
"time_per_iteration": 2.736283540725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172223, + "balance_loss_mlp": 1.14490044, + "epoch": 0.08657175836860331, + "flos": 445167332352.0, + "grad_norm": 0.10083399298029862, + "language_loss": 0.89324713, + "learning_rate": 0.0009916340761252837, + "loss": 0.90496939, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.27331543, + "step": 450, + "time_per_iteration": 2.7378761768341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159685, + "balance_loss_mlp": 1.13442445, + "epoch": 0.08676414005386687, + "flos": 844148210688.0, + "grad_norm": 0.08549336336253632, + "language_loss": 0.87181985, + "learning_rate": 0.0009915772288856832, + "loss": 0.88341665, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.25268555, + "step": 451, + "time_per_iteration": 3.0766491889953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155631, + "balance_loss_mlp": 1.12976265, + "epoch": 0.08695652173913043, + "flos": 603292437504.0, + "grad_norm": 0.07927995723527953, + "language_loss": 0.88654345, + "learning_rate": 0.000991520190798877, + "loss": 0.89809978, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.2590332, + "step": 452, + "time_per_iteration": 2.838925838470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158093, + "balance_loss_mlp": 1.13122344, + "epoch": 0.08714890342439399, + "flos": 730737552384.0, + "grad_norm": 0.12430534270573573, + "language_loss": 0.96291733, + "learning_rate": 0.0009914629618870089, + "loss": 0.97449821, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.26904297, + "step": 453, + "time_per_iteration": 2.902444839477539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103846, + "balance_loss_mlp": 1.0800997, + "epoch": 0.08734128510965757, + "flos": 1482303214080.0, + "grad_norm": 0.040702290127782634, + "language_loss": 0.78675872, + 
"learning_rate": 0.0009914055421722976, + "loss": 0.7977972, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.23730469, + "step": 454, + "time_per_iteration": 4.758902072906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089921, + "balance_loss_mlp": 1.06579328, + "epoch": 0.08753366679492113, + "flos": 1523022289920.0, + "grad_norm": 0.037925831915212815, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.8251788, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.24121094, + "step": 455, + "time_per_iteration": 4.819866180419922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230669, + "balance_loss_mlp": 1.19860172, + "epoch": 0.08772604848018468, + "flos": 721252035072.0, + "grad_norm": 0.12072891758744606, + "language_loss": 0.9005816, + "learning_rate": 0.0009912901304235883, + "loss": 0.91288829, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.32055664, + "step": 456, + "time_per_iteration": 2.928392171859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251391, + "balance_loss_mlp": 1.21851277, + "epoch": 0.08791843016544824, + "flos": 708233310720.0, + "grad_norm": 0.11610109334616998, + "language_loss": 0.85792667, + "learning_rate": 0.000991232138434397, + "loss": 0.8704406, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.32885742, + "step": 457, + "time_per_iteration": 2.868086099624634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268634, + "balance_loss_mlp": 1.23406374, + "epoch": 0.08811081185071182, + "flos": 473043151872.0, + "grad_norm": 0.1267050228562, + "language_loss": 0.92359412, + "learning_rate": 0.000991173955731976, + "loss": 0.93628043, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.34594727, + "step": 458, + "time_per_iteration": 2.673590898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225195, + "balance_loss_mlp": 1.19374788, 
+ "epoch": 0.08830319353597538, + "flos": 684980209152.0, + "grad_norm": 0.08225728813848474, + "language_loss": 0.98437196, + "learning_rate": 0.0009911155823389137, + "loss": 0.99662387, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.31445312, + "step": 459, + "time_per_iteration": 3.052828550338745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208858, + "balance_loss_mlp": 1.17938948, + "epoch": 0.08849557522123894, + "flos": 573509237760.0, + "grad_norm": 0.06750279925545952, + "language_loss": 0.93789524, + "learning_rate": 0.000991057018277873, + "loss": 0.94998378, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.29467773, + "step": 460, + "time_per_iteration": 2.7062363624572754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175577, + "balance_loss_mlp": 1.14656162, + "epoch": 0.0886879569065025, + "flos": 564567376896.0, + "grad_norm": 0.09934743705595177, + "language_loss": 0.93365753, + "learning_rate": 0.0009909982635715898, + "loss": 0.94541329, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.28979492, + "step": 461, + "time_per_iteration": 2.647963523864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163614, + "balance_loss_mlp": 1.13576651, + "epoch": 0.08888033859176607, + "flos": 563877987840.0, + "grad_norm": 0.09828505426249505, + "language_loss": 0.93045211, + "learning_rate": 0.0009909393182428751, + "loss": 0.94208831, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.27856445, + "step": 462, + "time_per_iteration": 2.6743412017822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163317, + "balance_loss_mlp": 1.13556552, + "epoch": 0.08907272027702963, + "flos": 465761622528.0, + "grad_norm": 0.08889819955039441, + "language_loss": 0.88051188, + "learning_rate": 0.000990880182314614, + "loss": 0.89214504, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.27758789, + "step": 463, + 
"time_per_iteration": 2.703216314315796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163709, + "balance_loss_mlp": 1.1364336, + "epoch": 0.08926510196229319, + "flos": 681528494592.0, + "grad_norm": 0.07282859671945509, + "language_loss": 0.89247352, + "learning_rate": 0.0009908208558097643, + "loss": 0.90411055, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.27319336, + "step": 464, + "time_per_iteration": 2.9412851333618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011721, + "balance_loss_mlp": 1.14410961, + "epoch": 0.08945748364755675, + "flos": 596692956672.0, + "grad_norm": 0.07278927788912996, + "language_loss": 0.90032935, + "learning_rate": 0.000990761338751359, + "loss": 0.91205037, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.2800293, + "step": 465, + "time_per_iteration": 2.7876837253570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_mlp": 1.0181731, + "epoch": 0.08964986533282032, + "flos": 1585931747328.0, + "grad_norm": 0.02426695301026172, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74698591, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.20800781, + "step": 466, + "time_per_iteration": 5.05983304977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189569, + "balance_loss_mlp": 1.16098237, + "epoch": 0.08984224701808388, + "flos": 533523967488.0, + "grad_norm": 0.07846733746050528, + "language_loss": 0.9248395, + "learning_rate": 0.0009906417330663815, + "loss": 0.93673521, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.28588867, + "step": 467, + "time_per_iteration": 2.696319103240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194386, + "balance_loss_mlp": 1.16539454, + "epoch": 0.09003462870334744, + "flos": 478931222016.0, + "grad_norm": 0.08323950534675657, + "language_loss": 0.88480067, + "learning_rate": 
0.0009905816444862442, + "loss": 0.89674455, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.29003906, + "step": 468, + "time_per_iteration": 2.6381607055664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218295, + "balance_loss_mlp": 1.18875456, + "epoch": 0.090227010388611, + "flos": 653625510912.0, + "grad_norm": 0.07740224213463104, + "language_loss": 0.8706888, + "learning_rate": 0.0009905213654454216, + "loss": 0.88287175, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.29516602, + "step": 469, + "time_per_iteration": 2.9251277446746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229446, + "balance_loss_mlp": 1.19940567, + "epoch": 0.09041939207387456, + "flos": 618186608640.0, + "grad_norm": 0.08990381668478556, + "language_loss": 0.94001997, + "learning_rate": 0.0009904608959673158, + "loss": 0.95231444, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.30053711, + "step": 470, + "time_per_iteration": 2.812967538833618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01247261, + "balance_loss_mlp": 1.21679068, + "epoch": 0.09061177375913813, + "flos": 454368724992.0, + "grad_norm": 0.12209234788536222, + "language_loss": 0.92894399, + "learning_rate": 0.000990400236075403, + "loss": 0.94141662, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.3046875, + "step": 471, + "time_per_iteration": 2.5002622604370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205406, + "balance_loss_mlp": 1.17622375, + "epoch": 0.0908041554444017, + "flos": 544247299584.0, + "grad_norm": 0.10180872621251921, + "language_loss": 0.91581351, + "learning_rate": 0.0009903393857932338, + "loss": 0.92786753, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.29150391, + "step": 472, + "time_per_iteration": 2.656669855117798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119747, + "balance_loss_mlp": 1.16866922, + "epoch": 
0.09099653712966525, + "flos": 564335009280.0, + "grad_norm": 0.09565392288785843, + "language_loss": 0.88802767, + "learning_rate": 0.0009902783451444317, + "loss": 0.90000236, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.28808594, + "step": 473, + "time_per_iteration": 2.769510269165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118613, + "balance_loss_mlp": 1.16034544, + "epoch": 0.09118891881492881, + "flos": 474540498432.0, + "grad_norm": 0.10259894411844421, + "language_loss": 0.94123209, + "learning_rate": 0.0009902171141526956, + "loss": 0.95309335, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.25769043, + "step": 474, + "time_per_iteration": 2.523611545562744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119701, + "balance_loss_mlp": 1.17120147, + "epoch": 0.09138130050019239, + "flos": 545860643328.0, + "grad_norm": 0.11667434950480311, + "language_loss": 0.82319391, + "learning_rate": 0.000990155692841797, + "loss": 0.83516395, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.25817871, + "step": 475, + "time_per_iteration": 2.9675121307373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227501, + "balance_loss_mlp": 1.20134616, + "epoch": 0.09157368218545595, + "flos": 732711744000.0, + "grad_norm": 0.09682112540143008, + "language_loss": 0.93477046, + "learning_rate": 0.0009900940812355818, + "loss": 0.94704551, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.26147461, + "step": 476, + "time_per_iteration": 2.924874782562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242063, + "balance_loss_mlp": 1.21519351, + "epoch": 0.0917660638707195, + "flos": 610981802496.0, + "grad_norm": 0.10139353171378648, + "language_loss": 0.88050354, + "learning_rate": 0.00099003227935797, + "loss": 0.89292419, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.26879883, + "step": 477, + "time_per_iteration": 
2.7283573150634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261302, + "balance_loss_mlp": 1.23314476, + "epoch": 0.09195844555598306, + "flos": 655851893760.0, + "grad_norm": 0.08348382552066277, + "language_loss": 0.91095632, + "learning_rate": 0.000989970287232955, + "loss": 0.92356932, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.28149414, + "step": 478, + "time_per_iteration": 2.8266103267669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241973, + "balance_loss_mlp": 1.21583056, + "epoch": 0.09215082724124664, + "flos": 476578930176.0, + "grad_norm": 0.10737558840696987, + "language_loss": 0.89902192, + "learning_rate": 0.0009899081048846043, + "loss": 0.91144162, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.26135254, + "step": 479, + "time_per_iteration": 2.6420280933380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291965, + "balance_loss_mlp": 1.26427281, + "epoch": 0.0923432089265102, + "flos": 524305322496.0, + "grad_norm": 0.10012000168567356, + "language_loss": 0.93502498, + "learning_rate": 0.0009898457323370593, + "loss": 0.94794464, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.27697754, + "step": 480, + "time_per_iteration": 2.6065309047698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246143, + "balance_loss_mlp": 1.21961892, + "epoch": 0.09253559061177376, + "flos": 545569178112.0, + "grad_norm": 0.0993880337212747, + "language_loss": 0.92708224, + "learning_rate": 0.000989783169614535, + "loss": 0.93954372, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.26525879, + "step": 481, + "time_per_iteration": 2.7099456787109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079427, + "balance_loss_mlp": 1.05930424, + "epoch": 0.09272797229703732, + "flos": 1538042370048.0, + "grad_norm": 0.03505173716607146, + "language_loss": 0.78752756, + "learning_rate": 
0.0009897204167413206, + "loss": 0.79832184, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.20117188, + "step": 482, + "time_per_iteration": 4.890375852584839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01231966, + "balance_loss_mlp": 1.20573974, + "epoch": 0.09292035398230089, + "flos": 689813273088.0, + "grad_norm": 0.10137363964482546, + "language_loss": 0.90139151, + "learning_rate": 0.000989657473741779, + "loss": 0.91371119, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.2623291, + "step": 483, + "time_per_iteration": 2.9370570182800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207368, + "balance_loss_mlp": 1.18004489, + "epoch": 0.09311273566756445, + "flos": 509749604352.0, + "grad_norm": 0.08498377120371232, + "language_loss": 0.9143101, + "learning_rate": 0.0009895943406403465, + "loss": 0.92638373, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.2734375, + "step": 484, + "time_per_iteration": 2.7508950233459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207641, + "balance_loss_mlp": 1.17798209, + "epoch": 0.09330511735282801, + "flos": 659404924416.0, + "grad_norm": 0.09176142665566275, + "language_loss": 0.84377563, + "learning_rate": 0.0009895310174615338, + "loss": 0.85585213, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.29638672, + "step": 485, + "time_per_iteration": 2.785452365875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111377, + "balance_loss_mlp": 1.09211314, + "epoch": 0.09349749903809157, + "flos": 1452845984256.0, + "grad_norm": 0.060723434374539316, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76829892, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.19238281, + "step": 486, + "time_per_iteration": 4.6911780834198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119428, + "balance_loss_mlp": 1.16636121, + "epoch": 
0.09368988072335514, + "flos": 520870860288.0, + "grad_norm": 0.10396612544825783, + "language_loss": 0.89653724, + "learning_rate": 0.0009894038009701782, + "loss": 0.90848005, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.27954102, + "step": 487, + "time_per_iteration": 2.6375234127044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240132, + "balance_loss_mlp": 1.20847011, + "epoch": 0.0938822624086187, + "flos": 497751381504.0, + "grad_norm": 0.09761423787564506, + "language_loss": 0.88893723, + "learning_rate": 0.0009893399077070253, + "loss": 0.90133858, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.31640625, + "step": 488, + "time_per_iteration": 2.63673734664917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217529, + "balance_loss_mlp": 1.18844151, + "epoch": 0.09407464409388226, + "flos": 533202766848.0, + "grad_norm": 0.08578938939363263, + "language_loss": 0.87286389, + "learning_rate": 0.0009892758244652718, + "loss": 0.88503921, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.29077148, + "step": 489, + "time_per_iteration": 2.6579813957214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226074, + "balance_loss_mlp": 1.19698668, + "epoch": 0.09426702577914582, + "flos": 586006700544.0, + "grad_norm": 0.10664482488995004, + "language_loss": 0.91801828, + "learning_rate": 0.0009892115512697968, + "loss": 0.93027902, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.29101562, + "step": 490, + "time_per_iteration": 2.744812250137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208984, + "balance_loss_mlp": 1.18106508, + "epoch": 0.0944594074644094, + "flos": 503357524992.0, + "grad_norm": 0.07150484911777356, + "language_loss": 0.94226933, + "learning_rate": 0.0009891470881455537, + "loss": 0.95435917, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.27905273, + "step": 491, + "time_per_iteration": 
2.7888436317443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184692, + "balance_loss_mlp": 1.15854979, + "epoch": 0.09465178914967295, + "flos": 571021125120.0, + "grad_norm": 0.08034794474061628, + "language_loss": 0.91272295, + "learning_rate": 0.0009890824351175692, + "loss": 0.92456985, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.26184082, + "step": 492, + "time_per_iteration": 2.6893324851989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168375, + "balance_loss_mlp": 1.1430074, + "epoch": 0.09484417083493651, + "flos": 549361916928.0, + "grad_norm": 0.11413207975143042, + "language_loss": 0.96479064, + "learning_rate": 0.0009890175922109435, + "loss": 0.9764744, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.25378418, + "step": 493, + "time_per_iteration": 2.678849935531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184878, + "balance_loss_mlp": 1.15874791, + "epoch": 0.09503655252020007, + "flos": 823894944768.0, + "grad_norm": 0.08018179898504754, + "language_loss": 0.9392823, + "learning_rate": 0.0009889525594508513, + "loss": 0.95113099, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.26147461, + "step": 494, + "time_per_iteration": 3.067603349685669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118757, + "balance_loss_mlp": 1.16171312, + "epoch": 0.09522893420546363, + "flos": 404621153280.0, + "grad_norm": 0.06605208103828443, + "language_loss": 0.88701022, + "learning_rate": 0.0009888873368625404, + "loss": 0.89888591, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.25891113, + "step": 495, + "time_per_iteration": 2.513042688369751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208336, + "balance_loss_mlp": 1.18301558, + "epoch": 0.0954213158907272, + "flos": 691016583168.0, + "grad_norm": 0.21045205282495727, + "language_loss": 0.923554, + "learning_rate": 0.0009888219244713326, + 
"loss": 0.93563735, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.2532959, + "step": 496, + "time_per_iteration": 2.867083787918091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121286, + "balance_loss_mlp": 1.18638349, + "epoch": 0.09561369757599077, + "flos": 519005325312.0, + "grad_norm": 0.11531388313037762, + "language_loss": 0.9129262, + "learning_rate": 0.0009887563223026229, + "loss": 0.92505479, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.26501465, + "step": 497, + "time_per_iteration": 2.708878993988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01251726, + "balance_loss_mlp": 1.23503661, + "epoch": 0.09580607926125433, + "flos": 1385614812672.0, + "grad_norm": 0.1018924396807409, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80319893, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.16699219, + "step": 498, + "time_per_iteration": 4.9335105419158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203662, + "balance_loss_mlp": 1.1767087, + "epoch": 0.09599846094651789, + "flos": 717436901376.0, + "grad_norm": 0.15301104951645897, + "language_loss": 0.9155978, + "learning_rate": 0.0009886245487346482, + "loss": 0.92763442, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.2701416, + "step": 499, + "time_per_iteration": 3.048356771469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012374, + "balance_loss_mlp": 1.20936203, + "epoch": 0.09619084263178146, + "flos": 386038130688.0, + "grad_norm": 0.11445293306924414, + "language_loss": 0.93613195, + "learning_rate": 0.0009885583773865422, + "loss": 0.948506, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.28076172, + "step": 500, + "time_per_iteration": 2.4812135696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124857, + "balance_loss_mlp": 1.21948266, + "epoch": 0.09638322431704502, + "flos": 
534129292800.0, + "grad_norm": 0.08673144300895683, + "language_loss": 0.91201293, + "learning_rate": 0.0009884920163632524, + "loss": 0.92449856, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.2911377, + "step": 501, + "time_per_iteration": 2.6971659660339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267642, + "balance_loss_mlp": 1.23836422, + "epoch": 0.09657560600230858, + "flos": 500671922688.0, + "grad_norm": 0.09557615258578338, + "language_loss": 0.9327184, + "learning_rate": 0.000988425465690543, + "loss": 0.94539481, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.29296875, + "step": 502, + "time_per_iteration": 2.6156561374664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0125655, + "balance_loss_mlp": 1.22724855, + "epoch": 0.09676798768757214, + "flos": 529261724160.0, + "grad_norm": 0.09431767215346384, + "language_loss": 0.90255487, + "learning_rate": 0.0009883587253942505, + "loss": 0.91512042, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.29284668, + "step": 503, + "time_per_iteration": 2.8239471912384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01284099, + "balance_loss_mlp": 1.25420117, + "epoch": 0.09696036937283571, + "flos": 463614534144.0, + "grad_norm": 0.11394137891765209, + "language_loss": 0.96597123, + "learning_rate": 0.0009882917955002862, + "loss": 0.97881228, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.29907227, + "step": 504, + "time_per_iteration": 2.603328227996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229149, + "balance_loss_mlp": 1.2021842, + "epoch": 0.09715275105809927, + "flos": 534974326272.0, + "grad_norm": 0.09281538791599028, + "language_loss": 0.89550316, + "learning_rate": 0.0009882246760346343, + "loss": 0.90779471, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.26977539, + "step": 505, + "time_per_iteration": 2.681687831878662 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229765, + "balance_loss_mlp": 1.20144057, + "epoch": 0.09734513274336283, + "flos": 454946886144.0, + "grad_norm": 0.10637320281066408, + "language_loss": 0.9312228, + "learning_rate": 0.0009881573670233533, + "loss": 0.94352043, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.28295898, + "step": 506, + "time_per_iteration": 2.5317869186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210552, + "balance_loss_mlp": 1.18480229, + "epoch": 0.09753751442862639, + "flos": 508805826048.0, + "grad_norm": 0.08668641437707587, + "language_loss": 0.88418829, + "learning_rate": 0.0009880898684925747, + "loss": 0.89629376, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.25769043, + "step": 507, + "time_per_iteration": 2.7037086486816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171606, + "balance_loss_mlp": 1.14662004, + "epoch": 0.09772989611388996, + "flos": 484273064448.0, + "grad_norm": 0.09301682260046856, + "language_loss": 0.8754462, + "learning_rate": 0.0009880221804685037, + "loss": 0.88716233, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.24987793, + "step": 508, + "time_per_iteration": 2.5904412269592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132812, + "balance_loss_mlp": 1.11431122, + "epoch": 0.09792227779915352, + "flos": 1566106140672.0, + "grad_norm": 0.05770369236985839, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80477232, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.18457031, + "step": 509, + "time_per_iteration": 4.754728317260742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155131, + "balance_loss_mlp": 1.12993002, + "epoch": 0.09811465948441708, + "flos": 587805424128.0, + "grad_norm": 0.08546011105886044, + "language_loss": 0.93283963, + "learning_rate": 0.0009878862360456733, + "loss": 0.94439089, + 
"num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.25219727, + "step": 510, + "time_per_iteration": 2.7473011016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139729, + "balance_loss_mlp": 1.11480212, + "epoch": 0.09830704116968064, + "flos": 613000410624.0, + "grad_norm": 0.09364527364696289, + "language_loss": 0.86814249, + "learning_rate": 0.0009878179796996922, + "loss": 0.87953973, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.24926758, + "step": 511, + "time_per_iteration": 2.74253249168396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157281, + "balance_loss_mlp": 1.13087618, + "epoch": 0.09849942285494422, + "flos": 538808910336.0, + "grad_norm": 0.0728025857811697, + "language_loss": 0.90271652, + "learning_rate": 0.0009877495339659754, + "loss": 0.91428936, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.26428223, + "step": 512, + "time_per_iteration": 2.7383904457092285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011795, + "balance_loss_mlp": 1.15373945, + "epoch": 0.09869180454020778, + "flos": 620474660352.0, + "grad_norm": 0.08851969035528326, + "language_loss": 0.84944135, + "learning_rate": 0.000987680898871096, + "loss": 0.86123633, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.2578125, + "step": 513, + "time_per_iteration": 2.7576277256011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213945, + "balance_loss_mlp": 1.18686032, + "epoch": 0.09888418622547133, + "flos": 811711342080.0, + "grad_norm": 0.10650793826837307, + "language_loss": 0.85207206, + "learning_rate": 0.0009876120744417, + "loss": 0.8642115, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.27075195, + "step": 514, + "time_per_iteration": 2.9868528842926025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205226, + "balance_loss_mlp": 1.17891693, + "epoch": 0.0990765679107349, + "flos": 535809447936.0, + 
"grad_norm": 0.12423818842648264, + "language_loss": 0.94048339, + "learning_rate": 0.0009875430607045078, + "loss": 0.95253563, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.2635498, + "step": 515, + "time_per_iteration": 2.6809887886047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217575, + "balance_loss_mlp": 1.19226718, + "epoch": 0.09926894959599845, + "flos": 587879576064.0, + "grad_norm": 0.09121928261100491, + "language_loss": 0.90633368, + "learning_rate": 0.000987473857686313, + "loss": 0.91850942, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.2532959, + "step": 516, + "time_per_iteration": 2.821868896484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01273678, + "balance_loss_mlp": 1.24556851, + "epoch": 0.09946133128126203, + "flos": 641234506752.0, + "grad_norm": 0.10235865570139392, + "language_loss": 0.92397732, + "learning_rate": 0.0009874044654139824, + "loss": 0.93671417, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.28125, + "step": 517, + "time_per_iteration": 2.754556894302368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269361, + "balance_loss_mlp": 1.24070311, + "epoch": 0.09965371296652559, + "flos": 465781446144.0, + "grad_norm": 0.1033638080855083, + "language_loss": 0.91346741, + "learning_rate": 0.0009873348839144563, + "loss": 0.92616105, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.28662109, + "step": 518, + "time_per_iteration": 2.5521421432495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223264, + "balance_loss_mlp": 1.19750261, + "epoch": 0.09984609465178915, + "flos": 483603499008.0, + "grad_norm": 0.08349046242237956, + "language_loss": 0.9484781, + "learning_rate": 0.000987265113214749, + "loss": 0.96071064, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.25793457, + "step": 519, + "time_per_iteration": 2.5728440284729004 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01209741, + "balance_loss_mlp": 1.18294215, + "epoch": 0.1000384763370527, + "flos": 569029681152.0, + "grad_norm": 0.0925674463481217, + "language_loss": 0.93808675, + "learning_rate": 0.0009871951533419476, + "loss": 0.95018411, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.26794434, + "step": 520, + "time_per_iteration": 2.720158576965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173015, + "balance_loss_mlp": 1.14725351, + "epoch": 0.10023085802231628, + "flos": 545796403200.0, + "grad_norm": 0.08576102326010304, + "language_loss": 0.87117791, + "learning_rate": 0.0009871250043232132, + "loss": 0.88290811, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.25769043, + "step": 521, + "time_per_iteration": 2.7765281200408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167625, + "balance_loss_mlp": 1.14231658, + "epoch": 0.10042323970757984, + "flos": 503454071808.0, + "grad_norm": 0.08176746103179605, + "language_loss": 0.85016751, + "learning_rate": 0.0009870546661857797, + "loss": 0.86184376, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.25317383, + "step": 522, + "time_per_iteration": 2.621741771697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192442, + "balance_loss_mlp": 1.16581106, + "epoch": 0.1006156213928434, + "flos": 770411533824.0, + "grad_norm": 0.1034937566099096, + "language_loss": 0.93671012, + "learning_rate": 0.0009869841389569553, + "loss": 0.94863456, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.26647949, + "step": 523, + "time_per_iteration": 2.9877190589904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176234, + "balance_loss_mlp": 1.15106893, + "epoch": 0.10080800307810696, + "flos": 490030083072.0, + "grad_norm": 0.09839184495226623, + "language_loss": 0.87745041, + "learning_rate": 0.0009869134226641206, + "loss": 0.88921273, + "num_input_tokens_seen": 
43495200, + "router_z_loss_mlp": 0.25170898, + "step": 524, + "time_per_iteration": 2.5881335735321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182922, + "balance_loss_mlp": 1.15635061, + "epoch": 0.10100038476337053, + "flos": 454724430336.0, + "grad_norm": 0.08405321822424026, + "language_loss": 0.86857122, + "learning_rate": 0.0009868425173347303, + "loss": 0.88040042, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.26599121, + "step": 525, + "time_per_iteration": 2.66532301902771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171089, + "balance_loss_mlp": 1.14556646, + "epoch": 0.10119276644863409, + "flos": 556438242816.0, + "grad_norm": 0.08405786654151125, + "language_loss": 0.94851571, + "learning_rate": 0.0009867714229963125, + "loss": 0.96022666, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.25549316, + "step": 526, + "time_per_iteration": 2.8129477500915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180551, + "balance_loss_mlp": 1.15515995, + "epoch": 0.10138514813389765, + "flos": 516235659264.0, + "grad_norm": 0.0887042459069511, + "language_loss": 0.92144597, + "learning_rate": 0.000986700139676468, + "loss": 0.93325144, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.25402832, + "step": 527, + "time_per_iteration": 2.5864803791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221342, + "balance_loss_mlp": 1.19498479, + "epoch": 0.10157752981916121, + "flos": 500570606592.0, + "grad_norm": 0.0908626798732068, + "language_loss": 0.89802891, + "learning_rate": 0.0009866286674028717, + "loss": 0.91024232, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.26379395, + "step": 528, + "time_per_iteration": 2.6321539878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195715, + "balance_loss_mlp": 1.1701684, + "epoch": 0.10176991150442478, + "flos": 656773277184.0, + "grad_norm": 
0.10105960014250041, + "language_loss": 0.86296791, + "learning_rate": 0.0009865570062032717, + "loss": 0.87492502, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.25561523, + "step": 529, + "time_per_iteration": 2.9451780319213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180062, + "balance_loss_mlp": 1.15431321, + "epoch": 0.10196229318968834, + "flos": 573259617792.0, + "grad_norm": 0.07153867300670864, + "language_loss": 0.9169668, + "learning_rate": 0.0009864851561054893, + "loss": 0.92876744, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.25756836, + "step": 530, + "time_per_iteration": 2.829380512237549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138967, + "balance_loss_mlp": 1.11554205, + "epoch": 0.1021546748749519, + "flos": 518207279616.0, + "grad_norm": 0.07949334936814403, + "language_loss": 0.90603149, + "learning_rate": 0.0009864131171374191, + "loss": 0.9174211, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.23413086, + "step": 531, + "time_per_iteration": 2.7103002071380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144785, + "balance_loss_mlp": 1.12042999, + "epoch": 0.10234705656021546, + "flos": 609766009344.0, + "grad_norm": 0.09480674153197077, + "language_loss": 0.89899409, + "learning_rate": 0.0009863408893270292, + "loss": 0.91044188, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.24353027, + "step": 532, + "time_per_iteration": 2.800015926361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135275, + "balance_loss_mlp": 1.11101604, + "epoch": 0.10253943824547904, + "flos": 601760586240.0, + "grad_norm": 0.12452848702407365, + "language_loss": 0.84814823, + "learning_rate": 0.0009862684727023605, + "loss": 0.85950094, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.24243164, + "step": 533, + "time_per_iteration": 2.733250856399536 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01142831, + "balance_loss_mlp": 1.11813033, + "epoch": 0.1027318199307426, + "flos": 662948043264.0, + "grad_norm": 0.10251703935298907, + "language_loss": 0.88274956, + "learning_rate": 0.0009861958672915283, + "loss": 0.89417779, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.24707031, + "step": 534, + "time_per_iteration": 2.8380610942840576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151783, + "balance_loss_mlp": 1.12847757, + "epoch": 0.10292420161600616, + "flos": 683275461120.0, + "grad_norm": 0.08316309975829886, + "language_loss": 0.88756025, + "learning_rate": 0.0009861230731227201, + "loss": 0.89907813, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.23291016, + "step": 535, + "time_per_iteration": 2.871997594833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188729, + "balance_loss_mlp": 1.16410041, + "epoch": 0.10311658330126972, + "flos": 490287043584.0, + "grad_norm": 0.08198011669981227, + "language_loss": 0.89923763, + "learning_rate": 0.0009860500902241973, + "loss": 0.91112483, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.24633789, + "step": 536, + "time_per_iteration": 2.623779058456421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200536, + "balance_loss_mlp": 1.17560923, + "epoch": 0.10330896498653329, + "flos": 431743343616.0, + "grad_norm": 0.07805911222058415, + "language_loss": 0.94478881, + "learning_rate": 0.0009859769186242942, + "loss": 0.95679414, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.24914551, + "step": 537, + "time_per_iteration": 2.580596923828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237375, + "balance_loss_mlp": 1.21290088, + "epoch": 0.10350134667179685, + "flos": 549591713280.0, + "grad_norm": 0.07373890024349967, + "language_loss": 0.87774181, + "learning_rate": 0.0009859035583514187, + "loss": 0.89011556, + "num_input_tokens_seen": 
44505456, + "router_z_loss_mlp": 0.24450684, + "step": 538, + "time_per_iteration": 2.6570377349853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01278522, + "balance_loss_mlp": 1.25283265, + "epoch": 0.10369372835706041, + "flos": 640626610176.0, + "grad_norm": 0.09282522264822365, + "language_loss": 0.89254487, + "learning_rate": 0.0009858300094340517, + "loss": 0.90533006, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.25720215, + "step": 539, + "time_per_iteration": 2.787065267562866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129717, + "balance_loss_mlp": 1.271981, + "epoch": 0.10388611004232397, + "flos": 521752969728.0, + "grad_norm": 0.12009350418361847, + "language_loss": 0.84273541, + "learning_rate": 0.0009857562719007473, + "loss": 0.85570705, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.2520752, + "step": 540, + "time_per_iteration": 2.60508394241333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269758, + "balance_loss_mlp": 1.24520063, + "epoch": 0.10407849172758753, + "flos": 702436644864.0, + "grad_norm": 0.09915993306854447, + "language_loss": 0.86265039, + "learning_rate": 0.0009856823457801331, + "loss": 0.87534791, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.24560547, + "step": 541, + "time_per_iteration": 2.916395664215088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200462, + "balance_loss_mlp": 1.17673898, + "epoch": 0.1042708734128511, + "flos": 502910415360.0, + "grad_norm": 0.08980852435022621, + "language_loss": 0.93430036, + "learning_rate": 0.00098560823110091, + "loss": 0.94630498, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.23718262, + "step": 542, + "time_per_iteration": 2.6473944187164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011666, + "balance_loss_mlp": 1.14424849, + "epoch": 0.10446325509811466, + "flos": 485592371712.0, + "grad_norm": 
0.09857064774686858, + "language_loss": 0.94166034, + "learning_rate": 0.000985533927891851, + "loss": 0.95332634, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.22338867, + "step": 543, + "time_per_iteration": 2.7833001613616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152313, + "balance_loss_mlp": 1.13023496, + "epoch": 0.10465563678337822, + "flos": 568634328576.0, + "grad_norm": 0.11299567756475092, + "language_loss": 0.91803026, + "learning_rate": 0.0009854594361818044, + "loss": 0.92955339, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.22070312, + "step": 544, + "time_per_iteration": 2.7342488765716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145926, + "balance_loss_mlp": 1.12322879, + "epoch": 0.10484801846864178, + "flos": 626093286912.0, + "grad_norm": 0.10706082764174026, + "language_loss": 0.90779245, + "learning_rate": 0.0009853847559996897, + "loss": 0.91925174, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.22680664, + "step": 545, + "time_per_iteration": 2.7671496868133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115478, + "balance_loss_mlp": 1.13199878, + "epoch": 0.10504040015390535, + "flos": 743412681216.0, + "grad_norm": 0.09298705322285353, + "language_loss": 0.90420544, + "learning_rate": 0.0009853098873745, + "loss": 0.91575325, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.2277832, + "step": 546, + "time_per_iteration": 3.0312061309814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114995, + "balance_loss_mlp": 1.12715745, + "epoch": 0.10523278183916891, + "flos": 586673694720.0, + "grad_norm": 0.08666617811450783, + "language_loss": 0.89437926, + "learning_rate": 0.0009852348303353027, + "loss": 0.90587872, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.22802734, + "step": 547, + "time_per_iteration": 2.8053338527679443 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01175894, + "balance_loss_mlp": 1.15260065, + "epoch": 0.10542516352443247, + "flos": 869644574208.0, + "grad_norm": 0.07202142444648872, + "language_loss": 0.8282218, + "learning_rate": 0.000985159584911237, + "loss": 0.83998078, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.23291016, + "step": 548, + "time_per_iteration": 3.168396472930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200066, + "balance_loss_mlp": 1.17569995, + "epoch": 0.10561754520969603, + "flos": 505428263424.0, + "grad_norm": 0.0989690478360349, + "language_loss": 0.89268672, + "learning_rate": 0.0009850841511315162, + "loss": 0.9046874, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.24365234, + "step": 549, + "time_per_iteration": 2.6511220932006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205335, + "balance_loss_mlp": 1.18058681, + "epoch": 0.1058099268949596, + "flos": 559981361664.0, + "grad_norm": 0.10906170470493136, + "language_loss": 0.90274942, + "learning_rate": 0.0009850085290254256, + "loss": 0.91480273, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.24755859, + "step": 550, + "time_per_iteration": 2.8123652935028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166073, + "balance_loss_mlp": 1.14193285, + "epoch": 0.10600230858022316, + "flos": 562049528832.0, + "grad_norm": 0.06887070936512274, + "language_loss": 0.8779422, + "learning_rate": 0.0009849327186223246, + "loss": 0.88960296, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.24121094, + "step": 551, + "time_per_iteration": 2.780959129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144115, + "balance_loss_mlp": 1.12010658, + "epoch": 0.10619469026548672, + "flos": 494326831104.0, + "grad_norm": 0.1035499947998288, + "language_loss": 0.94864386, + "learning_rate": 0.000984856719951646, + "loss": 0.96008497, + "num_input_tokens_seen": 45547328, + 
"router_z_loss_mlp": 0.23986816, + "step": 552, + "time_per_iteration": 2.599581718444824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135304, + "balance_loss_mlp": 1.1112473, + "epoch": 0.10638707195075028, + "flos": 676166828544.0, + "grad_norm": 0.08131430219430819, + "language_loss": 0.91351348, + "learning_rate": 0.0009847805330428943, + "loss": 0.92486656, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.24035645, + "step": 553, + "time_per_iteration": 2.9599480628967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126363, + "balance_loss_mlp": 1.1017344, + "epoch": 0.10657945363601386, + "flos": 488055891456.0, + "grad_norm": 0.10883623187773357, + "language_loss": 0.92631853, + "learning_rate": 0.0009847041579256481, + "loss": 0.93758214, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.24633789, + "step": 554, + "time_per_iteration": 2.592348575592041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139312, + "balance_loss_mlp": 1.11518431, + "epoch": 0.10677183532127742, + "flos": 482958526464.0, + "grad_norm": 0.08685206815428315, + "language_loss": 0.94236493, + "learning_rate": 0.0009846275946295592, + "loss": 0.95375812, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.2409668, + "step": 555, + "time_per_iteration": 2.6748178005218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157684, + "balance_loss_mlp": 1.13367498, + "epoch": 0.10696421700654098, + "flos": 656249444352.0, + "grad_norm": 0.1423144419608042, + "language_loss": 0.86826319, + "learning_rate": 0.0009845508431843518, + "loss": 0.87984002, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.23974609, + "step": 556, + "time_per_iteration": 3.0652637481689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188505, + "balance_loss_mlp": 1.16398418, + "epoch": 0.10715659869180454, + "flos": 567744878592.0, + "grad_norm": 0.08544226719489541, 
+ "language_loss": 0.87931871, + "learning_rate": 0.0009844739036198233, + "loss": 0.89120376, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.24523926, + "step": 557, + "time_per_iteration": 2.667473793029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210589, + "balance_loss_mlp": 1.18594849, + "epoch": 0.10734898037706811, + "flos": 540694268928.0, + "grad_norm": 0.07677269921737997, + "language_loss": 0.9440788, + "learning_rate": 0.0009843967759658448, + "loss": 0.95618474, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.24658203, + "step": 558, + "time_per_iteration": 2.7628064155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132066, + "balance_loss_mlp": 1.11194348, + "epoch": 0.10754136206233167, + "flos": 1476640171008.0, + "grad_norm": 0.0590422913979422, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.73899817, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.20117188, + "step": 559, + "time_per_iteration": 4.902129888534546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01241703, + "balance_loss_mlp": 1.21570349, + "epoch": 0.10773374374759523, + "flos": 512405844480.0, + "grad_norm": 0.0867010736609256, + "language_loss": 0.9488945, + "learning_rate": 0.000984241956509384, + "loss": 0.96131158, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.26025391, + "step": 560, + "time_per_iteration": 2.6891891956329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208783, + "balance_loss_mlp": 1.18289042, + "epoch": 0.10792612543285879, + "flos": 496503654912.0, + "grad_norm": 0.08963888455934524, + "language_loss": 0.90658677, + "learning_rate": 0.0009841642647670078, + "loss": 0.91867459, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.25927734, + "step": 561, + "time_per_iteration": 2.563408613204956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01198224, + "balance_loss_mlp": 1.17229605, + "epoch": 0.10811850711812235, + "flos": 735471498240.0, + "grad_norm": 0.08487676980325562, + "language_loss": 0.85033154, + "learning_rate": 0.0009840863850553944, + "loss": 0.86231375, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.25964355, + "step": 562, + "time_per_iteration": 2.9805734157562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183286, + "balance_loss_mlp": 1.157763, + "epoch": 0.10831088880338592, + "flos": 611540140032.0, + "grad_norm": 0.08249773787970602, + "language_loss": 0.90893888, + "learning_rate": 0.0009840083174047782, + "loss": 0.92077172, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.25537109, + "step": 563, + "time_per_iteration": 2.7391836643218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194315, + "balance_loss_mlp": 1.16986513, + "epoch": 0.10850327048864948, + "flos": 556317103104.0, + "grad_norm": 0.07051664629026161, + "language_loss": 0.85589021, + "learning_rate": 0.0009839300618454685, + "loss": 0.86783338, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.24438477, + "step": 564, + "time_per_iteration": 2.89290452003479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194246, + "balance_loss_mlp": 1.16989148, + "epoch": 0.10869565217391304, + "flos": 603208373760.0, + "grad_norm": 0.07367825547097939, + "language_loss": 0.91287452, + "learning_rate": 0.0009838516184078466, + "loss": 0.92481697, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.24353027, + "step": 565, + "time_per_iteration": 2.8416025638580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201114, + "balance_loss_mlp": 1.17573452, + "epoch": 0.1088880338591766, + "flos": 526178198016.0, + "grad_norm": 0.11472314835583913, + "language_loss": 0.88207066, + "learning_rate": 0.0009837729871223669, + "loss": 0.89408183, + "num_input_tokens_seen": 46734288, + 
"router_z_loss_mlp": 0.25402832, + "step": 566, + "time_per_iteration": 2.6492197513580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249487, + "balance_loss_mlp": 1.22309399, + "epoch": 0.10908041554444017, + "flos": 620272028160.0, + "grad_norm": 0.07200956845133732, + "language_loss": 0.88285792, + "learning_rate": 0.0009836941680195568, + "loss": 0.89535284, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.2644043, + "step": 567, + "time_per_iteration": 2.794311285018921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124424, + "balance_loss_mlp": 1.21801353, + "epoch": 0.10927279722970373, + "flos": 898125719040.0, + "grad_norm": 0.08672877457635139, + "language_loss": 0.83671671, + "learning_rate": 0.0009836151611300166, + "loss": 0.84915912, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.26245117, + "step": 568, + "time_per_iteration": 3.2202959060668945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232018, + "balance_loss_mlp": 1.2069366, + "epoch": 0.10946517891496729, + "flos": 528666310656.0, + "grad_norm": 0.0737206182188589, + "language_loss": 0.9499715, + "learning_rate": 0.0009835359664844194, + "loss": 0.96229166, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.25097656, + "step": 569, + "time_per_iteration": 2.6723880767822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115473, + "balance_loss_mlp": 1.09935594, + "epoch": 0.10965756060023085, + "flos": 1560751815168.0, + "grad_norm": 0.05305645754414589, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82152283, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.16113281, + "step": 570, + "time_per_iteration": 4.934283494949341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262159, + "balance_loss_mlp": 1.23583817, + "epoch": 0.10984994228549443, + "flos": 513075409920.0, + "grad_norm": 0.0759630537733653, 
+ "language_loss": 0.91932368, + "learning_rate": 0.0009833770140481118, + "loss": 0.93194532, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.2635498, + "step": 571, + "time_per_iteration": 2.6325361728668213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240536, + "balance_loss_mlp": 1.21385729, + "epoch": 0.11004232397075799, + "flos": 954705139200.0, + "grad_norm": 0.07085220990305834, + "language_loss": 0.82309085, + "learning_rate": 0.000983297256319112, + "loss": 0.83549619, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.26733398, + "step": 572, + "time_per_iteration": 3.230297088623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227341, + "balance_loss_mlp": 1.20004177, + "epoch": 0.11023470565602154, + "flos": 488181800448.0, + "grad_norm": 0.0905445578460947, + "language_loss": 0.86770016, + "learning_rate": 0.000983217310957477, + "loss": 0.87997353, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.27319336, + "step": 573, + "time_per_iteration": 2.8283607959747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230404, + "balance_loss_mlp": 1.20267606, + "epoch": 0.1104270873412851, + "flos": 655814817792.0, + "grad_norm": 0.08397098324277796, + "language_loss": 0.89933473, + "learning_rate": 0.000983137177994244, + "loss": 0.91163886, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.27734375, + "step": 574, + "time_per_iteration": 2.945197820663452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184809, + "balance_loss_mlp": 1.15805852, + "epoch": 0.11061946902654868, + "flos": 723426287616.0, + "grad_norm": 0.08995501683398337, + "language_loss": 0.85942268, + "learning_rate": 0.0009830568574605235, + "loss": 0.87127078, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.26782227, + "step": 575, + "time_per_iteration": 2.9714908599853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01173303, + "balance_loss_mlp": 1.14733911, + "epoch": 0.11081185071181224, + "flos": 835463310336.0, + "grad_norm": 0.11617237422906017, + "language_loss": 0.87585467, + "learning_rate": 0.0009829763493874992, + "loss": 0.88758773, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.2598877, + "step": 576, + "time_per_iteration": 3.0522892475128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185519, + "balance_loss_mlp": 1.15929341, + "epoch": 0.1110042323970758, + "flos": 609076620288.0, + "grad_norm": 0.07800734946110352, + "language_loss": 0.92923808, + "learning_rate": 0.0009828956538064264, + "loss": 0.94109321, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.26245117, + "step": 577, + "time_per_iteration": 2.8397951126098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198525, + "balance_loss_mlp": 1.17312193, + "epoch": 0.11119661408233936, + "flos": 595922075136.0, + "grad_norm": 0.07768178407950788, + "language_loss": 0.90871215, + "learning_rate": 0.0009828147707486344, + "loss": 0.92069739, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.25427246, + "step": 578, + "time_per_iteration": 2.714322805404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120727, + "balance_loss_mlp": 1.18262911, + "epoch": 0.11138899576760293, + "flos": 555835488768.0, + "grad_norm": 0.08360568840749934, + "language_loss": 0.86554426, + "learning_rate": 0.0009827337002455245, + "loss": 0.877617, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.24645996, + "step": 579, + "time_per_iteration": 2.742311477661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195367, + "balance_loss_mlp": 1.17049956, + "epoch": 0.11158137745286649, + "flos": 689746461696.0, + "grad_norm": 0.07475116375685303, + "language_loss": 0.87853694, + "learning_rate": 0.0009826524423285712, + "loss": 0.89049065, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 
0.24865723, + "step": 580, + "time_per_iteration": 3.014310121536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212273, + "balance_loss_mlp": 1.18770432, + "epoch": 0.11177375913813005, + "flos": 763011436032.0, + "grad_norm": 0.09493717034802315, + "language_loss": 0.88884461, + "learning_rate": 0.0009825709970293218, + "loss": 0.90096736, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.2454834, + "step": 581, + "time_per_iteration": 3.004209518432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215399, + "balance_loss_mlp": 1.19164097, + "epoch": 0.11196614082339361, + "flos": 806574329856.0, + "grad_norm": 0.0873103144771369, + "language_loss": 0.95079505, + "learning_rate": 0.0009824893643793956, + "loss": 0.96294904, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.23754883, + "step": 582, + "time_per_iteration": 3.0893442630767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220934, + "balance_loss_mlp": 1.1956501, + "epoch": 0.11215852250865718, + "flos": 558624978432.0, + "grad_norm": 0.08836320076119632, + "language_loss": 0.87841964, + "learning_rate": 0.0009824075444104857, + "loss": 0.89062899, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.25280762, + "step": 583, + "time_per_iteration": 2.7537503242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239656, + "balance_loss_mlp": 1.21521807, + "epoch": 0.11235090419392074, + "flos": 513572078592.0, + "grad_norm": 0.16884309783149784, + "language_loss": 0.93345737, + "learning_rate": 0.000982325537154357, + "loss": 0.94585395, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.24450684, + "step": 584, + "time_per_iteration": 2.59409499168396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211835, + "balance_loss_mlp": 1.18743277, + "epoch": 0.1125432858791843, + "flos": 491453277696.0, + "grad_norm": 0.08768097982415915, + "language_loss": 
0.93578511, + "learning_rate": 0.0009822433426428484, + "loss": 0.94790351, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.24401855, + "step": 585, + "time_per_iteration": 2.581516742706299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190012, + "balance_loss_mlp": 1.16627765, + "epoch": 0.11273566756444786, + "flos": 510725689344.0, + "grad_norm": 0.09638114373338931, + "language_loss": 0.8707509, + "learning_rate": 0.0009821609609078697, + "loss": 0.88265103, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.23730469, + "step": 586, + "time_per_iteration": 2.6160855293273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192266, + "balance_loss_mlp": 1.16885376, + "epoch": 0.11292804924971142, + "flos": 622446280704.0, + "grad_norm": 0.08368187760832956, + "language_loss": 0.89230156, + "learning_rate": 0.0009820783919814045, + "loss": 0.90422428, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.23425293, + "step": 587, + "time_per_iteration": 2.8534207344055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168851, + "balance_loss_mlp": 1.14552212, + "epoch": 0.113120430934975, + "flos": 478056453120.0, + "grad_norm": 0.1429978790264596, + "language_loss": 0.82743758, + "learning_rate": 0.0009819956358955095, + "loss": 0.83912605, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.2331543, + "step": 588, + "time_per_iteration": 2.5901453495025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173293, + "balance_loss_mlp": 1.14966619, + "epoch": 0.11331281262023855, + "flos": 467039084544.0, + "grad_norm": 0.08588056281957461, + "language_loss": 0.84002471, + "learning_rate": 0.0009819126926823127, + "loss": 0.85175765, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.23608398, + "step": 589, + "time_per_iteration": 2.530374765396118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202956, + 
"balance_loss_mlp": 1.17918611, + "epoch": 0.11350519430550211, + "flos": 650453151744.0, + "grad_norm": 0.07487704505114483, + "language_loss": 0.86892301, + "learning_rate": 0.000981829562374016, + "loss": 0.88095254, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.2376709, + "step": 590, + "time_per_iteration": 2.8030459880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244494, + "balance_loss_mlp": 1.22037804, + "epoch": 0.11369757599076567, + "flos": 557809680384.0, + "grad_norm": 0.12123010147526934, + "language_loss": 0.97345364, + "learning_rate": 0.0009817462450028933, + "loss": 0.98589861, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.24108887, + "step": 591, + "time_per_iteration": 2.7129569053649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233399, + "balance_loss_mlp": 1.20995021, + "epoch": 0.11388995767602925, + "flos": 571080222720.0, + "grad_norm": 0.08245604807530345, + "language_loss": 0.85052103, + "learning_rate": 0.0009816627406012916, + "loss": 0.86285496, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.23425293, + "step": 592, + "time_per_iteration": 2.8424665927886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218738, + "balance_loss_mlp": 1.19550395, + "epoch": 0.1140823393612928, + "flos": 740403307008.0, + "grad_norm": 0.128701110372521, + "language_loss": 0.84672415, + "learning_rate": 0.0009815790492016295, + "loss": 0.85891157, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.23217773, + "step": 593, + "time_per_iteration": 2.95451283454895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171348, + "balance_loss_mlp": 1.14887691, + "epoch": 0.11427472104655637, + "flos": 699004753920.0, + "grad_norm": 0.14505795416516268, + "language_loss": 0.86793518, + "learning_rate": 0.0009814951708363993, + "loss": 0.87964857, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.22473145, + 
"step": 594, + "time_per_iteration": 2.85953950881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125332, + "balance_loss_mlp": 1.11083615, + "epoch": 0.11446710273181993, + "flos": 1477178684928.0, + "grad_norm": 0.044045371588173315, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79116321, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.14453125, + "step": 595, + "time_per_iteration": 4.819102048873901 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116091, + "balance_loss_mlp": 1.09400165, + "epoch": 0.1146594844170835, + "flos": 494895080448.0, + "grad_norm": 0.15046985558242026, + "language_loss": 0.88265449, + "learning_rate": 0.0009813268533395648, + "loss": 0.8938154, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.2208252, + "step": 596, + "time_per_iteration": 2.5988821983337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127071, + "balance_loss_mlp": 1.10389698, + "epoch": 0.11485186610234706, + "flos": 474834534912.0, + "grad_norm": 0.12036284201424394, + "language_loss": 0.87534207, + "learning_rate": 0.0009812424142733073, + "loss": 0.88661277, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.23168945, + "step": 597, + "time_per_iteration": 2.5434508323669434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011341, + "balance_loss_mlp": 1.11084187, + "epoch": 0.11504424778761062, + "flos": 731209254912.0, + "grad_norm": 0.21736642596268407, + "language_loss": 0.85729969, + "learning_rate": 0.000981157788372175, + "loss": 0.86864072, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.23242188, + "step": 598, + "time_per_iteration": 3.0409185886383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140706, + "balance_loss_mlp": 1.11694789, + "epoch": 0.11523662947287418, + "flos": 545823567360.0, + "grad_norm": 0.09609751014588512, + "language_loss": 0.89140439, + 
"learning_rate": 0.0009810729756690223, + "loss": 0.90281147, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.23742676, + "step": 599, + "time_per_iteration": 2.7512025833129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149306, + "balance_loss_mlp": 1.12485611, + "epoch": 0.11542901115813775, + "flos": 775066558464.0, + "grad_norm": 0.09347854332414611, + "language_loss": 0.92009699, + "learning_rate": 0.0009809879761967766, + "loss": 0.93159008, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.24438477, + "step": 600, + "time_per_iteration": 2.966771364212036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114736, + "balance_loss_mlp": 1.1223377, + "epoch": 0.11562139284340131, + "flos": 730910449152.0, + "grad_norm": 0.11723124982013416, + "language_loss": 0.86307055, + "learning_rate": 0.0009809027899884378, + "loss": 0.87454414, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.25036621, + "step": 601, + "time_per_iteration": 2.960700273513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160833, + "balance_loss_mlp": 1.13693142, + "epoch": 0.11581377452866487, + "flos": 535878457344.0, + "grad_norm": 0.1190375758971125, + "language_loss": 0.88418448, + "learning_rate": 0.0009808174170770779, + "loss": 0.89579284, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.2388916, + "step": 602, + "time_per_iteration": 2.8176493644714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012435, + "balance_loss_mlp": 0.99622273, + "epoch": 0.11600615621392843, + "flos": 1555814863872.0, + "grad_norm": 0.011178693541089954, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.85910678, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.16210938, + "step": 603, + "time_per_iteration": 4.909565448760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265549, + "balance_loss_mlp": 
1.24103987, + "epoch": 0.116198537899192, + "flos": 537435274752.0, + "grad_norm": 0.08512052059651275, + "language_loss": 0.93440074, + "learning_rate": 0.0009806461112779462, + "loss": 0.94705629, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.24511719, + "step": 604, + "time_per_iteration": 2.658644199371338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134615, + "balance_loss_mlp": 1.3188746, + "epoch": 0.11639091958445556, + "flos": 454203168768.0, + "grad_norm": 0.21802356099424494, + "language_loss": 0.87949467, + "learning_rate": 0.0009805601784566814, + "loss": 0.89295614, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.27294922, + "step": 605, + "time_per_iteration": 2.5276598930358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334827, + "balance_loss_mlp": 1.30897105, + "epoch": 0.11658330126971912, + "flos": 555081859584.0, + "grad_norm": 0.1053210941194693, + "language_loss": 0.95447874, + "learning_rate": 0.0009804740590654089, + "loss": 0.96782702, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.25854492, + "step": 606, + "time_per_iteration": 2.6621856689453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237167, + "balance_loss_mlp": 1.2128365, + "epoch": 0.11677568295498268, + "flos": 716340049920.0, + "grad_norm": 0.09607271254678196, + "language_loss": 0.89416385, + "learning_rate": 0.0009803877531375635, + "loss": 0.90653551, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.2434082, + "step": 607, + "time_per_iteration": 2.8813462257385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219293, + "balance_loss_mlp": 1.19459295, + "epoch": 0.11696806464024626, + "flos": 609758668800.0, + "grad_norm": 0.08760560664793143, + "language_loss": 0.90707058, + "learning_rate": 0.0009803012607066523, + "loss": 0.91926354, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.24707031, + "step": 608, + 
"time_per_iteration": 2.7780392169952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185589, + "balance_loss_mlp": 1.16223621, + "epoch": 0.11716044632550981, + "flos": 520384103424.0, + "grad_norm": 0.10290817733218703, + "language_loss": 0.89330381, + "learning_rate": 0.0009802145818062543, + "loss": 0.90515971, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.23339844, + "step": 609, + "time_per_iteration": 2.713611364364624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189107, + "balance_loss_mlp": 1.16636157, + "epoch": 0.11735282801077337, + "flos": 507493859328.0, + "grad_norm": 0.1057697966066493, + "language_loss": 0.91819966, + "learning_rate": 0.0009801277164700212, + "loss": 0.93009067, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.22741699, + "step": 610, + "time_per_iteration": 2.575333595275879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207403, + "balance_loss_mlp": 1.18378794, + "epoch": 0.11754520969603693, + "flos": 686638342656.0, + "grad_norm": 0.09616788336185009, + "language_loss": 0.89864278, + "learning_rate": 0.0009800406647316776, + "loss": 0.91071677, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.23608398, + "step": 611, + "time_per_iteration": 2.831953287124634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156372, + "balance_loss_mlp": 1.14006376, + "epoch": 0.1177375913813005, + "flos": 1542487421952.0, + "grad_norm": 0.06675579160113412, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.78070831, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.16308594, + "step": 612, + "time_per_iteration": 4.820984840393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252443, + "balance_loss_mlp": 1.22860086, + "epoch": 0.11792997306656407, + "flos": 520522495488.0, + "grad_norm": 0.12351306502077156, + "language_loss": 0.8851943, + "learning_rate": 
0.000979866002183916, + "loss": 0.89771867, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.23815918, + "step": 613, + "time_per_iteration": 2.6552364826202393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233257, + "balance_loss_mlp": 1.20900965, + "epoch": 0.11812235475182763, + "flos": 666281189376.0, + "grad_norm": 0.09504576379881025, + "language_loss": 0.8953172, + "learning_rate": 0.0009797783914423082, + "loss": 0.90764976, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.24243164, + "step": 614, + "time_per_iteration": 2.8509650230407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120351, + "balance_loss_mlp": 1.18043077, + "epoch": 0.11831473643709119, + "flos": 621317122560.0, + "grad_norm": 0.09364161863028009, + "language_loss": 0.8453747, + "learning_rate": 0.0009796905944342094, + "loss": 0.85740978, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.23071289, + "step": 615, + "time_per_iteration": 2.8491313457489014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204695, + "balance_loss_mlp": 1.18137729, + "epoch": 0.11850711812235475, + "flos": 456688710144.0, + "grad_norm": 0.07677248067083364, + "language_loss": 0.88213146, + "learning_rate": 0.0009796026111937057, + "loss": 0.89417839, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.2331543, + "step": 616, + "time_per_iteration": 2.601902484893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165796, + "balance_loss_mlp": 1.14331329, + "epoch": 0.11869949980761832, + "flos": 513863543808.0, + "grad_norm": 0.0938738615494663, + "language_loss": 0.88620937, + "learning_rate": 0.0009795144417549552, + "loss": 0.89786732, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.22473145, + "step": 617, + "time_per_iteration": 2.7134363651275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168398, + "balance_loss_mlp": 1.14661872, + "epoch": 
0.11889188149288188, + "flos": 535016171520.0, + "grad_norm": 0.10272804913481705, + "language_loss": 0.89757544, + "learning_rate": 0.0009794260861521883, + "loss": 0.90925944, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.21801758, + "step": 618, + "time_per_iteration": 2.831108331680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156602, + "balance_loss_mlp": 1.13393998, + "epoch": 0.11908426317814544, + "flos": 498603755520.0, + "grad_norm": 0.1607893611237687, + "language_loss": 0.87325203, + "learning_rate": 0.0009793375444197075, + "loss": 0.88481802, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.2265625, + "step": 619, + "time_per_iteration": 2.6383235454559326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174094, + "balance_loss_mlp": 1.15122962, + "epoch": 0.119276644863409, + "flos": 659891681280.0, + "grad_norm": 0.10347254391959168, + "language_loss": 0.85134327, + "learning_rate": 0.000979248816591888, + "loss": 0.8630842, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.22875977, + "step": 620, + "time_per_iteration": 2.7817084789276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186173, + "balance_loss_mlp": 1.16314173, + "epoch": 0.11946902654867257, + "flos": 758746621440.0, + "grad_norm": 0.09880033160570031, + "language_loss": 0.85983694, + "learning_rate": 0.0009791599027031766, + "loss": 0.87169874, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.23010254, + "step": 621, + "time_per_iteration": 3.0790488719940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202902, + "balance_loss_mlp": 1.17933416, + "epoch": 0.11966140823393613, + "flos": 680999892480.0, + "grad_norm": 0.0888737424862181, + "language_loss": 0.85755396, + "learning_rate": 0.0009790708027880932, + "loss": 0.86958289, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.2355957, + "step": 622, + "time_per_iteration": 
2.839409351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148001, + "balance_loss_mlp": 1.13073957, + "epoch": 0.11985378991919969, + "flos": 1451071853568.0, + "grad_norm": 0.05973140246409555, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78575295, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.17285156, + "step": 623, + "time_per_iteration": 4.827035665512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208498, + "balance_loss_mlp": 1.18456042, + "epoch": 0.12004617160446325, + "flos": 527848441344.0, + "grad_norm": 0.14072799304068395, + "language_loss": 0.92775166, + "learning_rate": 0.0009788920450172487, + "loss": 0.93983662, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.23925781, + "step": 624, + "time_per_iteration": 2.688457489013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186727, + "balance_loss_mlp": 1.16287279, + "epoch": 0.12023855328972682, + "flos": 474219297792.0, + "grad_norm": 0.09148145427830927, + "language_loss": 0.89981961, + "learning_rate": 0.0009788023872308875, + "loss": 0.9116869, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.23852539, + "step": 625, + "time_per_iteration": 2.5552427768707275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073346, + "balance_loss_mlp": 1.05656123, + "epoch": 0.12043093497499038, + "flos": 1531771430400.0, + "grad_norm": 0.03421346211042783, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76502347, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.16796875, + "step": 626, + "time_per_iteration": 4.845045804977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152939, + "balance_loss_mlp": 1.12972903, + "epoch": 0.12062331666025394, + "flos": 539839323648.0, + "grad_norm": 0.16289185985396562, + "language_loss": 0.93840104, + "learning_rate": 
0.0009786225140303285, + "loss": 0.94993043, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.23217773, + "step": 627, + "time_per_iteration": 2.697042465209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167636, + "balance_loss_mlp": 1.14417565, + "epoch": 0.1208156983455175, + "flos": 511906604544.0, + "grad_norm": 0.2209026580633741, + "language_loss": 0.91874695, + "learning_rate": 0.0009785322986859634, + "loss": 0.93042338, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.23461914, + "step": 628, + "time_per_iteration": 2.6944122314453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153297, + "balance_loss_mlp": 1.12997985, + "epoch": 0.12100808003078108, + "flos": 596473072128.0, + "grad_norm": 0.07492335946827373, + "language_loss": 0.92751127, + "learning_rate": 0.0009784418975588838, + "loss": 0.93904424, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.23303223, + "step": 629, + "time_per_iteration": 2.7154979705810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156607, + "balance_loss_mlp": 1.1338973, + "epoch": 0.12120046171604464, + "flos": 522970960896.0, + "grad_norm": 0.07449655700779013, + "language_loss": 0.9307186, + "learning_rate": 0.0009783513106841862, + "loss": 0.9422847, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.22717285, + "step": 630, + "time_per_iteration": 2.704155921936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078203, + "balance_loss_mlp": 1.06208599, + "epoch": 0.1213928434013082, + "flos": 1554463249920.0, + "grad_norm": 0.048222043628353826, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77810907, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.16113281, + "step": 631, + "time_per_iteration": 4.9827399253845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186068, + "balance_loss_mlp": 1.16259575, + "epoch": 
0.12158522508657175, + "flos": 495391749120.0, + "grad_norm": 0.0695579405445101, + "language_loss": 0.87454391, + "learning_rate": 0.0009781695798326854, + "loss": 0.88640457, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.23474121, + "step": 632, + "time_per_iteration": 2.6077868938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119723, + "balance_loss_mlp": 1.17401958, + "epoch": 0.12177760677183531, + "flos": 475585592832.0, + "grad_norm": 0.0874974071775435, + "language_loss": 0.87916714, + "learning_rate": 0.0009780784359264365, + "loss": 0.89113945, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.23205566, + "step": 633, + "time_per_iteration": 2.6383118629455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040346, + "balance_loss_mlp": 1.02403784, + "epoch": 0.12196998845709889, + "flos": 1468458906624.0, + "grad_norm": 0.031225790586482303, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75229043, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.16308594, + "step": 634, + "time_per_iteration": 4.7924864292144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217333, + "balance_loss_mlp": 1.19409907, + "epoch": 0.12216237014236245, + "flos": 586572378624.0, + "grad_norm": 0.07796234580729426, + "language_loss": 0.8718015, + "learning_rate": 0.000977895591329867, + "loss": 0.88397485, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.23205566, + "step": 635, + "time_per_iteration": 2.803107976913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234374, + "balance_loss_mlp": 1.21001959, + "epoch": 0.12235475182762601, + "flos": 597997582848.0, + "grad_norm": 0.11392323325170377, + "language_loss": 0.86567664, + "learning_rate": 0.000977803890710533, + "loss": 0.87802041, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.24304199, + "step": 636, + "time_per_iteration": 
2.751648187637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120141, + "balance_loss_mlp": 1.17864108, + "epoch": 0.12254713351288957, + "flos": 497741469696.0, + "grad_norm": 0.07701221180236865, + "language_loss": 0.93102324, + "learning_rate": 0.0009777120045912774, + "loss": 0.94303727, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.2277832, + "step": 637, + "time_per_iteration": 2.691467761993408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186061, + "balance_loss_mlp": 1.16312516, + "epoch": 0.12273951519815314, + "flos": 605847361536.0, + "grad_norm": 0.08871868954386787, + "language_loss": 0.89725113, + "learning_rate": 0.0009776199330077736, + "loss": 0.90911174, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.22924805, + "step": 638, + "time_per_iteration": 2.7779197692871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117449, + "balance_loss_mlp": 1.15229297, + "epoch": 0.1229318968834167, + "flos": 597859190784.0, + "grad_norm": 0.08051745841053916, + "language_loss": 0.91847914, + "learning_rate": 0.0009775276759957667, + "loss": 0.93022406, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.2220459, + "step": 639, + "time_per_iteration": 2.8452744483947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170739, + "balance_loss_mlp": 1.14792228, + "epoch": 0.12312427856868026, + "flos": 678383299584.0, + "grad_norm": 0.0993028160053512, + "language_loss": 0.89413661, + "learning_rate": 0.0009774352335910745, + "loss": 0.90584403, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.22814941, + "step": 640, + "time_per_iteration": 2.8268258571624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011491, + "balance_loss_mlp": 1.12753499, + "epoch": 0.12331666025394382, + "flos": 608933458944.0, + "grad_norm": 0.08449801570349542, + "language_loss": 0.9440136, + "learning_rate": 0.000977342605829586, + 
"loss": 0.9555046, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.21569824, + "step": 641, + "time_per_iteration": 2.7570323944091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162286, + "balance_loss_mlp": 1.13913512, + "epoch": 0.12350904193920739, + "flos": 762504855552.0, + "grad_norm": 0.11072842132379487, + "language_loss": 0.85702711, + "learning_rate": 0.0009772497927472623, + "loss": 0.86864996, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.23144531, + "step": 642, + "time_per_iteration": 3.1265273094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165192, + "balance_loss_mlp": 1.14213657, + "epoch": 0.12370142362447095, + "flos": 540968481792.0, + "grad_norm": 0.12556940690050455, + "language_loss": 0.84848756, + "learning_rate": 0.0009771567943801368, + "loss": 0.86013943, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.23034668, + "step": 643, + "time_per_iteration": 2.652181386947632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160262, + "balance_loss_mlp": 1.13739729, + "epoch": 0.12389380530973451, + "flos": 548128871424.0, + "grad_norm": 0.08337524575338892, + "language_loss": 0.8885237, + "learning_rate": 0.0009770636107643152, + "loss": 0.90012634, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.22851562, + "step": 644, + "time_per_iteration": 2.7387216091156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165702, + "balance_loss_mlp": 1.14195597, + "epoch": 0.12408618699499807, + "flos": 540308828160.0, + "grad_norm": 0.19339175735102193, + "language_loss": 0.86818463, + "learning_rate": 0.0009769702419359738, + "loss": 0.87984169, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.23730469, + "step": 645, + "time_per_iteration": 2.6588823795318604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173541, + "balance_loss_mlp": 1.15027177, + "epoch": 0.12427856868026164, 
+ "flos": 745792137216.0, + "grad_norm": 0.07743854144019968, + "language_loss": 0.88816965, + "learning_rate": 0.000976876687931362, + "loss": 0.89990509, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.23254395, + "step": 646, + "time_per_iteration": 3.0269463062286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143309, + "balance_loss_mlp": 1.1195029, + "epoch": 0.1244709503655252, + "flos": 533716687872.0, + "grad_norm": 0.09200303883175577, + "language_loss": 0.84307587, + "learning_rate": 0.0009767829487868005, + "loss": 0.85450894, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.23791504, + "step": 647, + "time_per_iteration": 2.652456045150757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136141, + "balance_loss_mlp": 1.11240613, + "epoch": 0.12466333205078876, + "flos": 508099184640.0, + "grad_norm": 0.0914449303971137, + "language_loss": 0.88396645, + "learning_rate": 0.000976689024538682, + "loss": 0.89532787, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.23718262, + "step": 648, + "time_per_iteration": 2.66267466545105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114222, + "balance_loss_mlp": 1.11798477, + "epoch": 0.12485571373605232, + "flos": 681345686016.0, + "grad_norm": 0.0994157560321478, + "language_loss": 0.86652195, + "learning_rate": 0.0009765949152234716, + "loss": 0.87794411, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.2421875, + "step": 649, + "time_per_iteration": 2.9676578044891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130303, + "balance_loss_mlp": 1.11628377, + "epoch": 0.1250480954213159, + "flos": 1330159781376.0, + "grad_norm": 0.046775068167293626, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79816383, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.140625, + "step": 650, + "time_per_iteration": 4.760566711425781 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117959, + "balance_loss_mlp": 1.1559155, + "epoch": 0.12524047710657946, + "flos": 938550758400.0, + "grad_norm": 0.09210474588463947, + "language_loss": 0.813963, + "learning_rate": 0.0009764061415379919, + "loss": 0.82575887, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.2364502, + "step": 651, + "time_per_iteration": 3.3511757850646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120244, + "balance_loss_mlp": 1.17746568, + "epoch": 0.12543285879184302, + "flos": 513893279232.0, + "grad_norm": 0.09212981752556385, + "language_loss": 0.87756586, + "learning_rate": 0.0009763114772410109, + "loss": 0.88959026, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.24975586, + "step": 652, + "time_per_iteration": 2.5980827808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224224, + "balance_loss_mlp": 1.20058513, + "epoch": 0.12562524047710658, + "flos": 718328922624.0, + "grad_norm": 0.08737716532166849, + "language_loss": 0.86069119, + "learning_rate": 0.0009762166280235146, + "loss": 0.87293345, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.23632812, + "step": 653, + "time_per_iteration": 2.9842958450317383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01232055, + "balance_loss_mlp": 1.2083323, + "epoch": 0.12581762216237014, + "flos": 563712431616.0, + "grad_norm": 0.10849525216708464, + "language_loss": 0.86920303, + "learning_rate": 0.0009761215939223267, + "loss": 0.88152361, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.23706055, + "step": 654, + "time_per_iteration": 2.741058349609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0120932, + "balance_loss_mlp": 1.18547845, + "epoch": 0.1260100038476337, + "flos": 481893608448.0, + "grad_norm": 0.12794458644218995, + "language_loss": 0.85666406, + "learning_rate": 0.0009760263749743428, + "loss": 0.86875725, + 
"num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.23828125, + "step": 655, + "time_per_iteration": 2.5808663368225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180222, + "balance_loss_mlp": 1.15707195, + "epoch": 0.12620238553289725, + "flos": 575555010048.0, + "grad_norm": 0.095199105706819, + "language_loss": 0.89238775, + "learning_rate": 0.0009759309712165299, + "loss": 0.90418994, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.23144531, + "step": 656, + "time_per_iteration": 2.748532295227051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181081, + "balance_loss_mlp": 1.15800261, + "epoch": 0.12639476721816084, + "flos": 531164335104.0, + "grad_norm": 0.10916020635653645, + "language_loss": 0.9220295, + "learning_rate": 0.0009758353826859272, + "loss": 0.93384039, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.23071289, + "step": 657, + "time_per_iteration": 2.595853805541992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177185, + "balance_loss_mlp": 1.15273547, + "epoch": 0.1265871489034244, + "flos": 689968917504.0, + "grad_norm": 0.12847037355320456, + "language_loss": 0.87952709, + "learning_rate": 0.0009757396094196456, + "loss": 0.89129901, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.24438477, + "step": 658, + "time_per_iteration": 2.8620266914367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203673, + "balance_loss_mlp": 1.17950892, + "epoch": 0.12677953058868796, + "flos": 537138667008.0, + "grad_norm": 0.07321655622824354, + "language_loss": 0.83431864, + "learning_rate": 0.0009756436514548673, + "loss": 0.84635538, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.24169922, + "step": 659, + "time_per_iteration": 2.912091016769409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217457, + "balance_loss_mlp": 1.19229198, + "epoch": 0.12697191227395152, + "flos": 519022577664.0, 
+ "grad_norm": 0.10055529179538837, + "language_loss": 0.8726669, + "learning_rate": 0.0009755475088288466, + "loss": 0.88484144, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.25183105, + "step": 660, + "time_per_iteration": 2.781341075897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243827, + "balance_loss_mlp": 1.218292, + "epoch": 0.12716429395921508, + "flos": 566605808640.0, + "grad_norm": 0.1174457122427187, + "language_loss": 0.88868487, + "learning_rate": 0.0009754511815789095, + "loss": 0.90112311, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.25537109, + "step": 661, + "time_per_iteration": 2.8132684230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246052, + "balance_loss_mlp": 1.21920574, + "epoch": 0.12735667564447864, + "flos": 514103251968.0, + "grad_norm": 0.09745592985886121, + "language_loss": 0.8455224, + "learning_rate": 0.0009753546697424533, + "loss": 0.85798287, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.26904297, + "step": 662, + "time_per_iteration": 2.7095847129821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01243151, + "balance_loss_mlp": 1.21792674, + "epoch": 0.1275490573297422, + "flos": 541282341888.0, + "grad_norm": 0.12502287201474796, + "language_loss": 0.89571029, + "learning_rate": 0.0009752579733569475, + "loss": 0.90814179, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.25244141, + "step": 663, + "time_per_iteration": 2.6534910202026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119538, + "balance_loss_mlp": 1.17935824, + "epoch": 0.12774143901500576, + "flos": 1558700900352.0, + "grad_norm": 0.048799046747725165, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7607677, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.16015625, + "step": 664, + "time_per_iteration": 4.974175453186035 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01218811, + "balance_loss_mlp": 1.19439721, + "epoch": 0.12793382070026935, + "flos": 613744128000.0, + "grad_norm": 0.1143586633045421, + "language_loss": 0.88993388, + "learning_rate": 0.0009750640270890217, + "loss": 0.90212196, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.24401855, + "step": 665, + "time_per_iteration": 2.7663521766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124539, + "balance_loss_mlp": 1.22150016, + "epoch": 0.1281262023855329, + "flos": 707731499520.0, + "grad_norm": 0.11930184932546978, + "language_loss": 0.94833052, + "learning_rate": 0.0009749667772818983, + "loss": 0.96078444, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.23876953, + "step": 666, + "time_per_iteration": 3.01556134223938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120473, + "balance_loss_mlp": 1.10473776, + "epoch": 0.12831858407079647, + "flos": 1425034404864.0, + "grad_norm": 0.04410313188129877, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78056413, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.15722656, + "step": 667, + "time_per_iteration": 4.865432262420654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226701, + "balance_loss_mlp": 1.20370543, + "epoch": 0.12851096575606002, + "flos": 449098463232.0, + "grad_norm": 0.11041987280926156, + "language_loss": 0.94443977, + "learning_rate": 0.0009747717245101093, + "loss": 0.95670676, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.22998047, + "step": 668, + "time_per_iteration": 2.564667224884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217641, + "balance_loss_mlp": 1.19444275, + "epoch": 0.12870334744132358, + "flos": 479939240448.0, + "grad_norm": 0.0905963820135437, + "language_loss": 0.84166789, + "learning_rate": 0.00097467392162117, + "loss": 0.85384434, + "num_input_tokens_seen": 
55978544, + "router_z_loss_mlp": 0.23193359, + "step": 669, + "time_per_iteration": 2.625565528869629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218894, + "balance_loss_mlp": 1.19641066, + "epoch": 0.12889572912658714, + "flos": 638936543232.0, + "grad_norm": 0.07707390480747152, + "language_loss": 0.90709603, + "learning_rate": 0.0009745759344474708, + "loss": 0.919285, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.22485352, + "step": 670, + "time_per_iteration": 2.828810691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210198, + "balance_loss_mlp": 1.18807316, + "epoch": 0.1290881108118507, + "flos": 509944896000.0, + "grad_norm": 0.1296418275033253, + "language_loss": 0.88266867, + "learning_rate": 0.0009744777630270536, + "loss": 0.89477074, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.22119141, + "step": 671, + "time_per_iteration": 2.5931460857391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205012, + "balance_loss_mlp": 1.18351889, + "epoch": 0.12928049249711426, + "flos": 671054782464.0, + "grad_norm": 0.16263031414063664, + "language_loss": 0.92705458, + "learning_rate": 0.000974379407398032, + "loss": 0.93910474, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.21508789, + "step": 672, + "time_per_iteration": 2.947148323059082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208232, + "balance_loss_mlp": 1.18665552, + "epoch": 0.12947287418237785, + "flos": 793525870080.0, + "grad_norm": 0.09135110996657969, + "language_loss": 0.81593442, + "learning_rate": 0.0009742808675985913, + "loss": 0.82801676, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.21594238, + "step": 673, + "time_per_iteration": 3.179880380630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223794, + "balance_loss_mlp": 1.20184779, + "epoch": 0.1296652558676414, + "flos": 485466462720.0, + "grad_norm": 
0.08798796705409132, + "language_loss": 0.89740491, + "learning_rate": 0.0009741821436669876, + "loss": 0.90964288, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.21948242, + "step": 674, + "time_per_iteration": 2.5925161838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230582, + "balance_loss_mlp": 1.20812273, + "epoch": 0.12985763755290497, + "flos": 453459451392.0, + "grad_norm": 0.13739173158435178, + "language_loss": 0.91820276, + "learning_rate": 0.0009740832356415492, + "loss": 0.93050855, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.22473145, + "step": 675, + "time_per_iteration": 2.5184531211853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223494, + "balance_loss_mlp": 1.20120144, + "epoch": 0.13005001923816853, + "flos": 825061178880.0, + "grad_norm": 0.10341661200692882, + "language_loss": 0.87010336, + "learning_rate": 0.0009739841435606756, + "loss": 0.88233835, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.22290039, + "step": 676, + "time_per_iteration": 3.0507655143737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207183, + "balance_loss_mlp": 1.18511748, + "epoch": 0.1302424009234321, + "flos": 531381648384.0, + "grad_norm": 0.08057490768106465, + "language_loss": 0.89111441, + "learning_rate": 0.0009738848674628377, + "loss": 0.90318626, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.2208252, + "step": 677, + "time_per_iteration": 2.745363235473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121617, + "balance_loss_mlp": 1.19430709, + "epoch": 0.13043478260869565, + "flos": 525884161536.0, + "grad_norm": 0.0856975246411629, + "language_loss": 0.88498092, + "learning_rate": 0.000973785407386578, + "loss": 0.89714259, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.21862793, + "step": 678, + "time_per_iteration": 2.778620958328247 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01214606, + "balance_loss_mlp": 1.191324, + "epoch": 0.1306271642939592, + "flos": 626172208128.0, + "grad_norm": 0.06828211935324495, + "language_loss": 0.86676407, + "learning_rate": 0.0009736857633705103, + "loss": 0.87891012, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.23266602, + "step": 679, + "time_per_iteration": 2.9231183528900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209443, + "balance_loss_mlp": 1.18695986, + "epoch": 0.13081954597922277, + "flos": 550718300160.0, + "grad_norm": 0.0834800111741461, + "language_loss": 0.92100477, + "learning_rate": 0.0009735859354533196, + "loss": 0.93309915, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.22460938, + "step": 680, + "time_per_iteration": 2.775928258895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195854, + "balance_loss_mlp": 1.17248893, + "epoch": 0.13101192766448633, + "flos": 536911441920.0, + "grad_norm": 0.10927888529927046, + "language_loss": 0.91257143, + "learning_rate": 0.0009734859236737628, + "loss": 0.92453003, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.23339844, + "step": 681, + "time_per_iteration": 2.684873342514038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171844, + "balance_loss_mlp": 1.1486578, + "epoch": 0.13120430934974991, + "flos": 503508400128.0, + "grad_norm": 0.1264690256888091, + "language_loss": 0.92692226, + "learning_rate": 0.0009733857280706678, + "loss": 0.93864071, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.23168945, + "step": 682, + "time_per_iteration": 2.6460657119750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174213, + "balance_loss_mlp": 1.15156293, + "epoch": 0.13139669103501347, + "flos": 614295124992.0, + "grad_norm": 0.10018713039090629, + "language_loss": 0.83565485, + "learning_rate": 0.000973285348682934, + "loss": 0.84739697, + "num_input_tokens_seen": 57012736, + 
"router_z_loss_mlp": 0.2265625, + "step": 683, + "time_per_iteration": 2.758242607116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114811, + "balance_loss_mlp": 1.13504481, + "epoch": 0.13158907272027703, + "flos": 1484971564032.0, + "grad_norm": 0.05076292773380049, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79046488, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.13085938, + "step": 684, + "time_per_iteration": 4.8192243576049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204502, + "balance_loss_mlp": 1.17974257, + "epoch": 0.1317814544055406, + "flos": 985461852672.0, + "grad_norm": 0.1066751932733185, + "language_loss": 0.84567851, + "learning_rate": 0.0009730840387095046, + "loss": 0.85772359, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.24768066, + "step": 685, + "time_per_iteration": 3.3115832805633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227082, + "balance_loss_mlp": 1.20198846, + "epoch": 0.13197383609080415, + "flos": 611456076288.0, + "grad_norm": 0.07078465407426249, + "language_loss": 0.90421009, + "learning_rate": 0.0009729831082019642, + "loss": 0.9164809, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.25097656, + "step": 686, + "time_per_iteration": 2.8678879737854004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252204, + "balance_loss_mlp": 1.22750425, + "epoch": 0.1321662177760677, + "flos": 494403181056.0, + "grad_norm": 0.09776828955155538, + "language_loss": 0.8801111, + "learning_rate": 0.0009728819940660958, + "loss": 0.89263314, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.24707031, + "step": 687, + "time_per_iteration": 2.7938969135284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263825, + "balance_loss_mlp": 1.23863578, + "epoch": 0.13235859946133127, + "flos": 495841430016.0, + "grad_norm": 0.10048379585191887, 
+ "language_loss": 0.84283459, + "learning_rate": 0.0009727806963411557, + "loss": 0.8554728, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.25195312, + "step": 688, + "time_per_iteration": 2.607588529586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239051, + "balance_loss_mlp": 1.2133261, + "epoch": 0.13255098114659483, + "flos": 511686720000.0, + "grad_norm": 0.08603068006049115, + "language_loss": 0.8672629, + "learning_rate": 0.000972679215066471, + "loss": 0.87965345, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.25756836, + "step": 689, + "time_per_iteration": 2.7422516345977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224087, + "balance_loss_mlp": 1.19882667, + "epoch": 0.13274336283185842, + "flos": 547370472960.0, + "grad_norm": 0.10287089436887557, + "language_loss": 0.9870705, + "learning_rate": 0.0009725775502814401, + "loss": 0.99931133, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.25268555, + "step": 690, + "time_per_iteration": 2.5919952392578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192013, + "balance_loss_mlp": 1.16732466, + "epoch": 0.13293574451712198, + "flos": 640772342784.0, + "grad_norm": 0.1091756570575493, + "language_loss": 0.84613961, + "learning_rate": 0.0009724757020255327, + "loss": 0.85805976, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.2467041, + "step": 691, + "time_per_iteration": 2.851348400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011849, + "balance_loss_mlp": 1.15994906, + "epoch": 0.13312812620238554, + "flos": 491480441856.0, + "grad_norm": 0.0968423296469171, + "language_loss": 0.86866987, + "learning_rate": 0.0009723736703382902, + "loss": 0.88051891, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.24951172, + "step": 692, + "time_per_iteration": 2.5881834030151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179974, 
+ "balance_loss_mlp": 1.15652537, + "epoch": 0.1333205078876491, + "flos": 508944218112.0, + "grad_norm": 0.10463911515585092, + "language_loss": 0.82742584, + "learning_rate": 0.0009722714552593244, + "loss": 0.83922553, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.23413086, + "step": 693, + "time_per_iteration": 2.6343894004821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186752, + "balance_loss_mlp": 1.16344643, + "epoch": 0.13351288957291266, + "flos": 418697455104.0, + "grad_norm": 0.12210775976205426, + "language_loss": 0.93531036, + "learning_rate": 0.000972169056828319, + "loss": 0.94717789, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.23303223, + "step": 694, + "time_per_iteration": 2.4834342002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183255, + "balance_loss_mlp": 1.16046166, + "epoch": 0.13370527125817622, + "flos": 615901128192.0, + "grad_norm": 0.08175934073664855, + "language_loss": 0.87263072, + "learning_rate": 0.0009720664750850283, + "loss": 0.88446331, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.22790527, + "step": 695, + "time_per_iteration": 2.796005964279175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191523, + "balance_loss_mlp": 1.16836047, + "epoch": 0.13389765294343978, + "flos": 626038958592.0, + "grad_norm": 0.0918947132133249, + "language_loss": 0.92442453, + "learning_rate": 0.0009719637100692784, + "loss": 0.9363398, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.23168945, + "step": 696, + "time_per_iteration": 2.7338545322418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173282, + "balance_loss_mlp": 1.15093064, + "epoch": 0.13409003462870334, + "flos": 609691857408.0, + "grad_norm": 0.09425790223988205, + "language_loss": 0.82822204, + "learning_rate": 0.0009718607618209661, + "loss": 0.83995485, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 
0.22351074, + "step": 697, + "time_per_iteration": 2.8834567070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167823, + "balance_loss_mlp": 1.14468443, + "epoch": 0.13428241631396692, + "flos": 683816546304.0, + "grad_norm": 0.07380520807835853, + "language_loss": 0.87331033, + "learning_rate": 0.0009717576303800595, + "loss": 0.88498855, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.23120117, + "step": 698, + "time_per_iteration": 3.0662593841552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189182, + "balance_loss_mlp": 1.1649704, + "epoch": 0.13447479799923048, + "flos": 508815737856.0, + "grad_norm": 0.08733354578890483, + "language_loss": 0.85059655, + "learning_rate": 0.0009716543157865975, + "loss": 0.86248839, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.24182129, + "step": 699, + "time_per_iteration": 2.7156968116760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210396, + "balance_loss_mlp": 1.1879611, + "epoch": 0.13466717968449404, + "flos": 897510481920.0, + "grad_norm": 0.08759306221047211, + "language_loss": 0.82954025, + "learning_rate": 0.0009715508180806907, + "loss": 0.84164423, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.22436523, + "step": 700, + "time_per_iteration": 3.204936981201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209609, + "balance_loss_mlp": 1.18669748, + "epoch": 0.1348595613697576, + "flos": 989938838016.0, + "grad_norm": 0.07843453256975112, + "language_loss": 0.89359999, + "learning_rate": 0.0009714471373025202, + "loss": 0.90569609, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.22900391, + "step": 701, + "time_per_iteration": 3.4600374698638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186239, + "balance_loss_mlp": 1.16323161, + "epoch": 0.13505194305502116, + "flos": 487826095104.0, + "grad_norm": 0.07505390512906053, + "language_loss": 
0.88395512, + "learning_rate": 0.0009713432734923386, + "loss": 0.89581752, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.23010254, + "step": 702, + "time_per_iteration": 2.638005018234253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173659, + "balance_loss_mlp": 1.15109301, + "epoch": 0.13524432474028472, + "flos": 613385851392.0, + "grad_norm": 0.09376344684626736, + "language_loss": 0.86520576, + "learning_rate": 0.0009712392266904696, + "loss": 0.8769424, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.22558594, + "step": 703, + "time_per_iteration": 2.7503063678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116976, + "balance_loss_mlp": 1.14838624, + "epoch": 0.13543670642554828, + "flos": 904794582528.0, + "grad_norm": 0.18430775331568308, + "language_loss": 0.85049546, + "learning_rate": 0.0009711349969373076, + "loss": 0.86219305, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.21386719, + "step": 704, + "time_per_iteration": 3.1815178394317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166202, + "balance_loss_mlp": 1.14376664, + "epoch": 0.13562908811081184, + "flos": 550616984064.0, + "grad_norm": 0.08099598593900344, + "language_loss": 0.80275941, + "learning_rate": 0.0009710305842733178, + "loss": 0.81442142, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.22436523, + "step": 705, + "time_per_iteration": 2.7307353019714355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152043, + "balance_loss_mlp": 1.13138402, + "epoch": 0.1358214697960754, + "flos": 508044856320.0, + "grad_norm": 0.08979940018627898, + "language_loss": 0.89208561, + "learning_rate": 0.0009709259887390373, + "loss": 0.90360606, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.20666504, + "step": 706, + "time_per_iteration": 2.6135804653167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160605, + 
"balance_loss_mlp": 1.13901603, + "epoch": 0.136013851481339, + "flos": 528896107008.0, + "grad_norm": 0.11609486524108804, + "language_loss": 0.9066751, + "learning_rate": 0.0009708212103750737, + "loss": 0.91828114, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.21606445, + "step": 707, + "time_per_iteration": 2.632742166519165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185341, + "balance_loss_mlp": 1.16383576, + "epoch": 0.13620623316660255, + "flos": 659081152512.0, + "grad_norm": 0.10488018026765993, + "language_loss": 0.86886567, + "learning_rate": 0.0009707162492221051, + "loss": 0.88071907, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.21508789, + "step": 708, + "time_per_iteration": 2.9155325889587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221514, + "balance_loss_mlp": 1.19948387, + "epoch": 0.1363986148518661, + "flos": 671882563584.0, + "grad_norm": 0.11565397704484869, + "language_loss": 0.87553132, + "learning_rate": 0.0009706111053208815, + "loss": 0.88774645, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.22058105, + "step": 709, + "time_per_iteration": 2.843981981277466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233371, + "balance_loss_mlp": 1.21016061, + "epoch": 0.13659099653712967, + "flos": 473062975488.0, + "grad_norm": 0.10007182380605975, + "language_loss": 0.85645008, + "learning_rate": 0.0009705057787122232, + "loss": 0.86878371, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.23193359, + "step": 710, + "time_per_iteration": 2.594890832901001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195501, + "balance_loss_mlp": 1.17281508, + "epoch": 0.13678337822239323, + "flos": 452715734016.0, + "grad_norm": 0.08836610284298578, + "language_loss": 0.90505099, + "learning_rate": 0.0009704002694370216, + "loss": 0.91700602, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.22680664, + 
"step": 711, + "time_per_iteration": 2.5702362060546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117145, + "balance_loss_mlp": 1.14863288, + "epoch": 0.13697575990765679, + "flos": 519623133696.0, + "grad_norm": 0.11670756159189942, + "language_loss": 0.86028767, + "learning_rate": 0.0009702945775362388, + "loss": 0.87200224, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.22802734, + "step": 712, + "time_per_iteration": 2.6679470539093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149984, + "balance_loss_mlp": 1.12776387, + "epoch": 0.13716814159292035, + "flos": 480388921344.0, + "grad_norm": 0.10271253203083616, + "language_loss": 0.86890107, + "learning_rate": 0.0009701887030509086, + "loss": 0.8804009, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.22229004, + "step": 713, + "time_per_iteration": 2.618906021118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112596, + "balance_loss_mlp": 1.1041683, + "epoch": 0.1373605232781839, + "flos": 545650670592.0, + "grad_norm": 0.09375417211048337, + "language_loss": 0.90942538, + "learning_rate": 0.0009700826460221346, + "loss": 0.92068493, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.21801758, + "step": 714, + "time_per_iteration": 2.7277417182922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133315, + "balance_loss_mlp": 1.11104631, + "epoch": 0.1375529049634475, + "flos": 708791648256.0, + "grad_norm": 0.1250628990201497, + "language_loss": 0.92436254, + "learning_rate": 0.0009699764064910921, + "loss": 0.93569565, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.22265625, + "step": 715, + "time_per_iteration": 2.900053024291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129357, + "balance_loss_mlp": 1.10697007, + "epoch": 0.13774528664871105, + "flos": 486696936960.0, + "grad_norm": 0.18348866981846054, + "language_loss": 0.86833155, + 
"learning_rate": 0.0009698699844990268, + "loss": 0.87962508, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.22387695, + "step": 716, + "time_per_iteration": 2.645792245864868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136392, + "balance_loss_mlp": 1.11483872, + "epoch": 0.1379376683339746, + "flos": 680199275520.0, + "grad_norm": 0.08476879745046602, + "language_loss": 0.87948525, + "learning_rate": 0.0009697633800872555, + "loss": 0.89084923, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.21557617, + "step": 717, + "time_per_iteration": 2.9197771549224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153248, + "balance_loss_mlp": 1.13183844, + "epoch": 0.13813005001923817, + "flos": 610946924544.0, + "grad_norm": 0.08051298122060387, + "language_loss": 0.90472651, + "learning_rate": 0.0009696565932971655, + "loss": 0.91625893, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.2142334, + "step": 718, + "time_per_iteration": 2.9118661880493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157354, + "balance_loss_mlp": 1.1350143, + "epoch": 0.13832243170450173, + "flos": 588729378816.0, + "grad_norm": 0.09173992406124648, + "language_loss": 0.897349, + "learning_rate": 0.0009695496241702153, + "loss": 0.90892255, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.22338867, + "step": 719, + "time_per_iteration": 2.8108739852905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184496, + "balance_loss_mlp": 1.16145301, + "epoch": 0.1385148133897653, + "flos": 700002860544.0, + "grad_norm": 0.09716749239115424, + "language_loss": 0.85599422, + "learning_rate": 0.0009694424727479339, + "loss": 0.86783922, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.23034668, + "step": 720, + "time_per_iteration": 2.9078242778778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190722, + "balance_loss_mlp": 
1.16825104, + "epoch": 0.13870719507502885, + "flos": 598254543360.0, + "grad_norm": 0.08276113558291018, + "language_loss": 0.88687241, + "learning_rate": 0.0009693351390719213, + "loss": 0.89877963, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.22473145, + "step": 721, + "time_per_iteration": 2.727829933166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214778, + "balance_loss_mlp": 1.19178224, + "epoch": 0.1388995767602924, + "flos": 586572378624.0, + "grad_norm": 0.08055125516722848, + "language_loss": 0.9053812, + "learning_rate": 0.000969227623183848, + "loss": 0.91752893, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.22998047, + "step": 722, + "time_per_iteration": 2.8233954906463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202462, + "balance_loss_mlp": 1.17980003, + "epoch": 0.139091958445556, + "flos": 651120145920.0, + "grad_norm": 0.06957111358845897, + "language_loss": 0.90902817, + "learning_rate": 0.0009691199251254554, + "loss": 0.92105281, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.22668457, + "step": 723, + "time_per_iteration": 2.838449001312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188398, + "balance_loss_mlp": 1.16651106, + "epoch": 0.13928434013081956, + "flos": 575737818624.0, + "grad_norm": 0.08029384244148012, + "language_loss": 0.86382651, + "learning_rate": 0.0009690120449385555, + "loss": 0.87571049, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.21899414, + "step": 724, + "time_per_iteration": 2.7877347469329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191299, + "balance_loss_mlp": 1.16917384, + "epoch": 0.13947672181608312, + "flos": 563225674752.0, + "grad_norm": 0.113442270614728, + "language_loss": 0.92300928, + "learning_rate": 0.0009689039826650312, + "loss": 0.93492222, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.22131348, + "step": 725, + 
"time_per_iteration": 2.8086507320404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219435, + "balance_loss_mlp": 1.20293677, + "epoch": 0.13966910350134668, + "flos": 1521546964992.0, + "grad_norm": 0.07583456833656638, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77742493, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.16503906, + "step": 726, + "time_per_iteration": 4.891220331192017 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177839, + "balance_loss_mlp": 1.15583265, + "epoch": 0.13986148518661023, + "flos": 499854053376.0, + "grad_norm": 0.4935698294407845, + "language_loss": 0.86680418, + "learning_rate": 0.0009686873120259941, + "loss": 0.8785826, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.22021484, + "step": 727, + "time_per_iteration": 2.584016799926758 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220836, + "balance_loss_mlp": 1.19853175, + "epoch": 0.1400538668718738, + "flos": 598674488832.0, + "grad_norm": 0.12530325225106098, + "language_loss": 0.86788189, + "learning_rate": 0.0009685787037446004, + "loss": 0.88009018, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.22314453, + "step": 728, + "time_per_iteration": 2.7812938690185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256231, + "balance_loss_mlp": 1.2321384, + "epoch": 0.14024624855713735, + "flos": 594039287808.0, + "grad_norm": 0.19184429152401888, + "language_loss": 0.86789989, + "learning_rate": 0.0009684699135448201, + "loss": 0.88046223, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.24072266, + "step": 729, + "time_per_iteration": 2.7354156970977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316519, + "balance_loss_mlp": 1.29105544, + "epoch": 0.1404386302424009, + "flos": 506584585728.0, + "grad_norm": 0.08142335105414879, + "language_loss": 0.91990757, + "learning_rate": 
0.0009683609414688895, + "loss": 0.93307269, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.25463867, + "step": 730, + "time_per_iteration": 2.7542572021484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396811, + "balance_loss_mlp": 1.36896372, + "epoch": 0.14063101192766447, + "flos": 573407921664.0, + "grad_norm": 0.09882218945586521, + "language_loss": 0.86064744, + "learning_rate": 0.0009682517875591154, + "loss": 0.87461555, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.27856445, + "step": 731, + "time_per_iteration": 2.7971835136413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01440007, + "balance_loss_mlp": 1.41070533, + "epoch": 0.14082339361292806, + "flos": 564619133952.0, + "grad_norm": 0.11775580833602758, + "language_loss": 0.85772473, + "learning_rate": 0.0009681424518578749, + "loss": 0.87212479, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.29248047, + "step": 732, + "time_per_iteration": 2.742525100708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01460191, + "balance_loss_mlp": 1.43045998, + "epoch": 0.14101577529819162, + "flos": 463584798720.0, + "grad_norm": 0.14540211876107528, + "language_loss": 0.87523216, + "learning_rate": 0.000968032934407616, + "loss": 0.88983405, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.296875, + "step": 733, + "time_per_iteration": 2.586650848388672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01389602, + "balance_loss_mlp": 1.35989547, + "epoch": 0.14120815698345518, + "flos": 596085060096.0, + "grad_norm": 0.09505428174523772, + "language_loss": 0.81872886, + "learning_rate": 0.0009679232352508571, + "loss": 0.83262491, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.29711914, + "step": 734, + "time_per_iteration": 2.8065295219421387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01337262, + "balance_loss_mlp": 1.30776978, + 
"epoch": 0.14140053866871874, + "flos": 535137311232.0, + "grad_norm": 0.08594729931011787, + "language_loss": 0.8053807, + "learning_rate": 0.0009678133544301871, + "loss": 0.8187533, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.29492188, + "step": 735, + "time_per_iteration": 2.681156635284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290781, + "balance_loss_mlp": 1.26231337, + "epoch": 0.1415929203539823, + "flos": 520265534976.0, + "grad_norm": 0.07917660118228964, + "language_loss": 0.91284931, + "learning_rate": 0.0009677032919882658, + "loss": 0.92575711, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.28442383, + "step": 736, + "time_per_iteration": 2.701876163482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267129, + "balance_loss_mlp": 1.2393055, + "epoch": 0.14178530203924586, + "flos": 482335948800.0, + "grad_norm": 0.11161795715290385, + "language_loss": 0.91632634, + "learning_rate": 0.000967593047967823, + "loss": 0.92899764, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.27832031, + "step": 737, + "time_per_iteration": 2.549489736557007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01257034, + "balance_loss_mlp": 1.22987819, + "epoch": 0.14197768372450942, + "flos": 676638904320.0, + "grad_norm": 0.11515852654264594, + "language_loss": 0.86905932, + "learning_rate": 0.0009674826224116593, + "loss": 0.88162971, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.27160645, + "step": 738, + "time_per_iteration": 2.8459107875823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01274254, + "balance_loss_mlp": 1.24875474, + "epoch": 0.14217006540977298, + "flos": 446039529984.0, + "grad_norm": 0.086163857469945, + "language_loss": 0.8627907, + "learning_rate": 0.0009673720153626455, + "loss": 0.87553322, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.25512695, + "step": 739, + "time_per_iteration": 
2.6033051013946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01298128, + "balance_loss_mlp": 1.27345169, + "epoch": 0.14236244709503657, + "flos": 496503654912.0, + "grad_norm": 0.07922284002741106, + "language_loss": 0.8672145, + "learning_rate": 0.0009672612268637235, + "loss": 0.88019574, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.2467041, + "step": 740, + "time_per_iteration": 2.639249801635742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331294, + "balance_loss_mlp": 1.30575967, + "epoch": 0.14255482878030012, + "flos": 648313403904.0, + "grad_norm": 0.09083563941739939, + "language_loss": 0.84015429, + "learning_rate": 0.0009671502569579048, + "loss": 0.85346723, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.25537109, + "step": 741, + "time_per_iteration": 2.784358263015747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01372438, + "balance_loss_mlp": 1.34778547, + "epoch": 0.14274721046556368, + "flos": 536165153280.0, + "grad_norm": 0.08785871424370759, + "language_loss": 0.89829892, + "learning_rate": 0.0009670391056882719, + "loss": 0.91202337, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.2467041, + "step": 742, + "time_per_iteration": 2.765284299850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01384139, + "balance_loss_mlp": 1.35946321, + "epoch": 0.14293959215082724, + "flos": 957057431040.0, + "grad_norm": 0.09890816943940939, + "language_loss": 0.88263386, + "learning_rate": 0.0009669277730979776, + "loss": 0.89647526, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.24694824, + "step": 743, + "time_per_iteration": 3.2124171257019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01409259, + "balance_loss_mlp": 1.38365269, + "epoch": 0.1431319738360908, + "flos": 693089519616.0, + "grad_norm": 0.08939291923456745, + "language_loss": 0.85339808, + "learning_rate": 0.0009668162592302449, 
+ "loss": 0.86749065, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.25610352, + "step": 744, + "time_per_iteration": 2.947239398956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01413521, + "balance_loss_mlp": 1.38784337, + "epoch": 0.14332435552135436, + "flos": 565439574528.0, + "grad_norm": 0.12486964956779355, + "language_loss": 0.86141676, + "learning_rate": 0.0009667045641283676, + "loss": 0.87555194, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.25708008, + "step": 745, + "time_per_iteration": 2.67399001121521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01345232, + "balance_loss_mlp": 1.32049656, + "epoch": 0.14351673720661792, + "flos": 738374787072.0, + "grad_norm": 0.09833561966825685, + "language_loss": 0.94721901, + "learning_rate": 0.0009665926878357092, + "loss": 0.96067131, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.24743652, + "step": 746, + "time_per_iteration": 2.951524257659912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308325, + "balance_loss_mlp": 1.28470945, + "epoch": 0.14370911889188148, + "flos": 549230865408.0, + "grad_norm": 0.09374380516730212, + "language_loss": 0.90804815, + "learning_rate": 0.0009664806303957043, + "loss": 0.92113143, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.23608398, + "step": 747, + "time_per_iteration": 2.7018370628356934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01290979, + "balance_loss_mlp": 1.26711321, + "epoch": 0.14390150057714507, + "flos": 590295734784.0, + "grad_norm": 0.09976705309421963, + "language_loss": 0.87274301, + "learning_rate": 0.0009663683918518571, + "loss": 0.88565284, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.23840332, + "step": 748, + "time_per_iteration": 2.9669973850250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260109, + "balance_loss_mlp": 1.23742342, + "epoch": 
0.14409388226240863, + "flos": 591047165952.0, + "grad_norm": 0.09601459473111058, + "language_loss": 0.85172814, + "learning_rate": 0.0009662559722477428, + "loss": 0.86432928, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.22680664, + "step": 749, + "time_per_iteration": 2.692737579345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01313989, + "balance_loss_mlp": 1.2952019, + "epoch": 0.1442862639476722, + "flos": 1511263401984.0, + "grad_norm": 0.07630612016334831, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77476966, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.1875, + "step": 750, + "time_per_iteration": 5.012727975845337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203648, + "balance_loss_mlp": 1.18093836, + "epoch": 0.14447864563293575, + "flos": 496765384704.0, + "grad_norm": 0.10872642357348963, + "language_loss": 0.88863885, + "learning_rate": 0.0009660305900333632, + "loss": 0.90067536, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.22705078, + "step": 751, + "time_per_iteration": 2.715942859649658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173383, + "balance_loss_mlp": 1.15045881, + "epoch": 0.1446710273181993, + "flos": 589678299648.0, + "grad_norm": 0.08046883529286915, + "language_loss": 0.82496673, + "learning_rate": 0.0009659176275105992, + "loss": 0.83670056, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.22924805, + "step": 752, + "time_per_iteration": 2.713360071182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180444, + "balance_loss_mlp": 1.15698361, + "epoch": 0.14486340900346287, + "flos": 585818749440.0, + "grad_norm": 0.07494959784402849, + "language_loss": 0.85518491, + "learning_rate": 0.0009658044841025701, + "loss": 0.86698937, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.23425293, + "step": 753, + "time_per_iteration": 
2.7982797622680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117179, + "balance_loss_mlp": 1.14774585, + "epoch": 0.14505579068872643, + "flos": 504672062976.0, + "grad_norm": 0.10908868033385523, + "language_loss": 0.81575012, + "learning_rate": 0.0009656911598532021, + "loss": 0.82746804, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.24023438, + "step": 754, + "time_per_iteration": 2.642843246459961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216386, + "balance_loss_mlp": 1.19192445, + "epoch": 0.14524817237399, + "flos": 486815505408.0, + "grad_norm": 0.08024204468384731, + "language_loss": 0.89968902, + "learning_rate": 0.0009655776548064917, + "loss": 0.91185284, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.24462891, + "step": 755, + "time_per_iteration": 2.6598751544952393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240024, + "balance_loss_mlp": 1.2152878, + "epoch": 0.14544055405925355, + "flos": 728175287808.0, + "grad_norm": 0.0778788297064716, + "language_loss": 0.88022745, + "learning_rate": 0.0009654639690065054, + "loss": 0.89262772, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.24743652, + "step": 756, + "time_per_iteration": 2.8861637115478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126465, + "balance_loss_mlp": 1.2393297, + "epoch": 0.14563293574451713, + "flos": 593643935232.0, + "grad_norm": 0.09020306103656467, + "language_loss": 0.87895447, + "learning_rate": 0.00096535010249738, + "loss": 0.89160097, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.25317383, + "step": 757, + "time_per_iteration": 2.7438864707946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270837, + "balance_loss_mlp": 1.24456334, + "epoch": 0.1458253174297807, + "flos": 560478030336.0, + "grad_norm": 0.12633601395220453, + "language_loss": 0.82038969, + "learning_rate": 0.0009652360553233224, + 
"loss": 0.83309805, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.26318359, + "step": 758, + "time_per_iteration": 2.7446844577789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210641, + "balance_loss_mlp": 1.18994594, + "epoch": 0.14601769911504425, + "flos": 1557855866880.0, + "grad_norm": 0.05582061662785393, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.7498439, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.20703125, + "step": 759, + "time_per_iteration": 4.942702054977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212546, + "balance_loss_mlp": 1.18641555, + "epoch": 0.1462100808003078, + "flos": 866301516288.0, + "grad_norm": 0.06567012775246582, + "language_loss": 0.81178761, + "learning_rate": 0.0009650074191575883, + "loss": 0.8239131, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.26171875, + "step": 760, + "time_per_iteration": 3.2085912227630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198598, + "balance_loss_mlp": 1.17261064, + "epoch": 0.14640246248557137, + "flos": 522943796736.0, + "grad_norm": 0.07877672537318793, + "language_loss": 0.85659027, + "learning_rate": 0.0009648928302546766, + "loss": 0.86857623, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.26013184, + "step": 761, + "time_per_iteration": 2.7206709384918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176065, + "balance_loss_mlp": 1.15095961, + "epoch": 0.14659484417083493, + "flos": 1030544487936.0, + "grad_norm": 0.07899561323852963, + "language_loss": 0.85068321, + "learning_rate": 0.0009647780608643613, + "loss": 0.86244392, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.25109863, + "step": 762, + "time_per_iteration": 3.4438586235046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170525, + "balance_loss_mlp": 1.14620686, + "epoch": 0.1467872258560985, 
+ "flos": 500671922688.0, + "grad_norm": 0.141987740723005, + "language_loss": 0.87758678, + "learning_rate": 0.0009646631110312001, + "loss": 0.88929206, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.24304199, + "step": 763, + "time_per_iteration": 2.6546902656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152908, + "balance_loss_mlp": 1.12836289, + "epoch": 0.14697960754136205, + "flos": 547797758976.0, + "grad_norm": 0.07748728130668867, + "language_loss": 0.88344562, + "learning_rate": 0.0009645479807998203, + "loss": 0.89497471, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.2454834, + "step": 764, + "time_per_iteration": 2.7865586280822754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149811, + "balance_loss_mlp": 1.12623131, + "epoch": 0.14717198922662564, + "flos": 517849003008.0, + "grad_norm": 0.07163260805176828, + "language_loss": 0.92376024, + "learning_rate": 0.0009644326702149196, + "loss": 0.93525833, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.23571777, + "step": 765, + "time_per_iteration": 2.729707717895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114598, + "balance_loss_mlp": 1.12176871, + "epoch": 0.1473643709118892, + "flos": 732024552960.0, + "grad_norm": 0.10016890685610987, + "language_loss": 0.84570462, + "learning_rate": 0.0009643171793212653, + "loss": 0.85716444, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.24206543, + "step": 766, + "time_per_iteration": 3.104130983352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147763, + "balance_loss_mlp": 1.12331319, + "epoch": 0.14755675259715276, + "flos": 620538900480.0, + "grad_norm": 0.0994818648660217, + "language_loss": 0.88828337, + "learning_rate": 0.0009642015081636952, + "loss": 0.89976102, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.24438477, + "step": 767, + "time_per_iteration": 2.6991779804229736 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118291, + "balance_loss_mlp": 1.15871024, + "epoch": 0.14774913428241632, + "flos": 452219065344.0, + "grad_norm": 0.10983776315132832, + "language_loss": 0.87698913, + "learning_rate": 0.0009640856567871166, + "loss": 0.8888182, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.24182129, + "step": 768, + "time_per_iteration": 2.5240631103515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212502, + "balance_loss_mlp": 1.18818331, + "epoch": 0.14794151596767988, + "flos": 837234869760.0, + "grad_norm": 0.07387168528771362, + "language_loss": 0.88451684, + "learning_rate": 0.0009639696252365072, + "loss": 0.89664185, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.24304199, + "step": 769, + "time_per_iteration": 3.0557117462158203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01239197, + "balance_loss_mlp": 1.21551013, + "epoch": 0.14813389765294344, + "flos": 686092114944.0, + "grad_norm": 0.09914913961127292, + "language_loss": 0.8159318, + "learning_rate": 0.0009638534135569144, + "loss": 0.82832372, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.23657227, + "step": 770, + "time_per_iteration": 2.9298524856567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01245135, + "balance_loss_mlp": 1.22161531, + "epoch": 0.148326279338207, + "flos": 509887996416.0, + "grad_norm": 0.09866814803314855, + "language_loss": 0.89646047, + "learning_rate": 0.0009637370217934554, + "loss": 0.90891182, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.23498535, + "step": 771, + "time_per_iteration": 2.682309865951538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221897, + "balance_loss_mlp": 1.19855595, + "epoch": 0.14851866102347056, + "flos": 588161129472.0, + "grad_norm": 0.06824551266768007, + "language_loss": 0.83023787, + "learning_rate": 0.0009636204499913175, + "loss": 0.84245688, + 
"num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.23327637, + "step": 772, + "time_per_iteration": 2.883767604827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223775, + "balance_loss_mlp": 1.20097065, + "epoch": 0.14871104270873411, + "flos": 691026494976.0, + "grad_norm": 0.07043844896966983, + "language_loss": 0.87725186, + "learning_rate": 0.0009635036981957581, + "loss": 0.88948965, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.22802734, + "step": 773, + "time_per_iteration": 2.9000537395477295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187129, + "balance_loss_mlp": 1.16394269, + "epoch": 0.1489034243939977, + "flos": 655098264576.0, + "grad_norm": 0.15141860037933205, + "language_loss": 0.90646893, + "learning_rate": 0.0009633867664521043, + "loss": 0.91834021, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.23168945, + "step": 774, + "time_per_iteration": 2.8832309246063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169397, + "balance_loss_mlp": 1.14643705, + "epoch": 0.14909580607926126, + "flos": 475835212800.0, + "grad_norm": 0.08953509264873717, + "language_loss": 0.86451691, + "learning_rate": 0.0009632696548057527, + "loss": 0.87621093, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.22961426, + "step": 775, + "time_per_iteration": 2.5678458213806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114459, + "balance_loss_mlp": 1.12229764, + "epoch": 0.14928818776452482, + "flos": 611087887872.0, + "grad_norm": 0.10138358829207124, + "language_loss": 0.84634435, + "learning_rate": 0.0009631523633021704, + "loss": 0.85779023, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.22290039, + "step": 776, + "time_per_iteration": 2.8479549884796143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127015, + "balance_loss_mlp": 1.10418677, + "epoch": 0.14948056944978838, + "flos": 
561772744704.0, + "grad_norm": 0.10363335088159256, + "language_loss": 0.88188493, + "learning_rate": 0.0009630348919868936, + "loss": 0.89315504, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.22814941, + "step": 777, + "time_per_iteration": 2.7757747173309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136966, + "balance_loss_mlp": 1.11441135, + "epoch": 0.14967295113505194, + "flos": 449199779328.0, + "grad_norm": 0.09986786801472973, + "language_loss": 0.81042939, + "learning_rate": 0.0009629172409055293, + "loss": 0.82179904, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.22558594, + "step": 778, + "time_per_iteration": 2.5126540660858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145221, + "balance_loss_mlp": 1.12336957, + "epoch": 0.1498653328203155, + "flos": 571285426176.0, + "grad_norm": 0.09261188529780942, + "language_loss": 0.87480628, + "learning_rate": 0.0009627994101037531, + "loss": 0.88625842, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.21875, + "step": 779, + "time_per_iteration": 2.7716262340545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115587, + "balance_loss_mlp": 1.13354254, + "epoch": 0.15005771450557906, + "flos": 631215244800.0, + "grad_norm": 0.08443086809005321, + "language_loss": 0.88840389, + "learning_rate": 0.0009626813996273114, + "loss": 0.8999626, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.22338867, + "step": 780, + "time_per_iteration": 2.8740992546081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186209, + "balance_loss_mlp": 1.16370249, + "epoch": 0.15025009619084262, + "flos": 577939235328.0, + "grad_norm": 0.09833782575281567, + "language_loss": 0.88844621, + "learning_rate": 0.0009625632095220198, + "loss": 0.90030831, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.22497559, + "step": 781, + "time_per_iteration": 2.9050698280334473 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204209, + "balance_loss_mlp": 1.18169069, + "epoch": 0.1504424778761062, + "flos": 483887623680.0, + "grad_norm": 0.1242367807618526, + "language_loss": 0.87087309, + "learning_rate": 0.0009624448398337637, + "loss": 0.88291514, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.22509766, + "step": 782, + "time_per_iteration": 2.5470597743988037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227287, + "balance_loss_mlp": 1.20476806, + "epoch": 0.15063485956136977, + "flos": 762512196096.0, + "grad_norm": 0.08884420814610612, + "language_loss": 0.8877629, + "learning_rate": 0.0009623262906084984, + "loss": 0.90003586, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.22521973, + "step": 783, + "time_per_iteration": 3.0006895065307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01229008, + "balance_loss_mlp": 1.20682311, + "epoch": 0.15082724124663333, + "flos": 497630241792.0, + "grad_norm": 0.08808618298813263, + "language_loss": 0.8990804, + "learning_rate": 0.0009622075618922486, + "loss": 0.91137052, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.22192383, + "step": 784, + "time_per_iteration": 2.7111520767211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207095, + "balance_loss_mlp": 1.18568492, + "epoch": 0.15101962293189689, + "flos": 509725011456.0, + "grad_norm": 0.08652833198143661, + "language_loss": 0.87003136, + "learning_rate": 0.0009620886537311091, + "loss": 0.88210225, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.2142334, + "step": 785, + "time_per_iteration": 2.6401422023773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181477, + "balance_loss_mlp": 1.15950704, + "epoch": 0.15121200461716044, + "flos": 457756199424.0, + "grad_norm": 0.2899950143802249, + "language_loss": 0.85118186, + "learning_rate": 0.000961969566171244, + "loss": 0.8629967, + 
"num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.21972656, + "step": 786, + "time_per_iteration": 2.526909351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196609, + "balance_loss_mlp": 1.17443573, + "epoch": 0.151404386302424, + "flos": 537986271744.0, + "grad_norm": 0.08121966250588863, + "language_loss": 0.90082663, + "learning_rate": 0.0009618502992588873, + "loss": 0.91279268, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.22167969, + "step": 787, + "time_per_iteration": 2.6575541496276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230806, + "balance_loss_mlp": 1.20764375, + "epoch": 0.15159676798768756, + "flos": 688209467904.0, + "grad_norm": 0.0715770490301525, + "language_loss": 0.87907356, + "learning_rate": 0.0009617308530403424, + "loss": 0.89138162, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.23168945, + "step": 788, + "time_per_iteration": 3.028930187225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01258523, + "balance_loss_mlp": 1.23478842, + "epoch": 0.15178914967295112, + "flos": 545319558144.0, + "grad_norm": 0.0802298351217653, + "language_loss": 0.87239158, + "learning_rate": 0.0009616112275619825, + "loss": 0.8849768, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.23718262, + "step": 789, + "time_per_iteration": 2.746056079864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0132077, + "balance_loss_mlp": 1.29635596, + "epoch": 0.1519815313582147, + "flos": 511770783744.0, + "grad_norm": 0.12648829262821384, + "language_loss": 0.83592963, + "learning_rate": 0.0009614914228702503, + "loss": 0.84913737, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.24414062, + "step": 790, + "time_per_iteration": 2.6734559535980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308065, + "balance_loss_mlp": 1.28415179, + "epoch": 0.15217391304347827, + "flos": 684088187904.0, + 
"grad_norm": 0.09276885660597874, + "language_loss": 0.89010954, + "learning_rate": 0.0009613714390116581, + "loss": 0.9031902, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.23901367, + "step": 791, + "time_per_iteration": 2.983484983444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285002, + "balance_loss_mlp": 1.26071882, + "epoch": 0.15236629472874183, + "flos": 644186981376.0, + "grad_norm": 0.07985140077311874, + "language_loss": 0.85613286, + "learning_rate": 0.0009612512760327879, + "loss": 0.86898291, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.24291992, + "step": 792, + "time_per_iteration": 2.883850336074829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01244556, + "balance_loss_mlp": 1.21998703, + "epoch": 0.1525586764140054, + "flos": 412876196352.0, + "grad_norm": 0.09831690791880561, + "language_loss": 0.84491324, + "learning_rate": 0.0009611309339802909, + "loss": 0.85735881, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.24560547, + "step": 793, + "time_per_iteration": 2.4435439109802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207721, + "balance_loss_mlp": 1.1844871, + "epoch": 0.15275105809926895, + "flos": 802801414656.0, + "grad_norm": 0.0855298606279622, + "language_loss": 0.83781004, + "learning_rate": 0.0009610104129008881, + "loss": 0.84988725, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.23205566, + "step": 794, + "time_per_iteration": 3.13722825050354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196195, + "balance_loss_mlp": 1.17304444, + "epoch": 0.1529434397845325, + "flos": 612422249472.0, + "grad_norm": 0.09863604959388503, + "language_loss": 0.88015008, + "learning_rate": 0.0009608897128413701, + "loss": 0.89211196, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.23132324, + "step": 795, + "time_per_iteration": 2.746291160583496 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01176373, + "balance_loss_mlp": 1.15306783, + "epoch": 0.15313582146979607, + "flos": 615246243840.0, + "grad_norm": 0.059228494387600535, + "language_loss": 0.85641718, + "learning_rate": 0.0009607688338485965, + "loss": 0.86818099, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.23278809, + "step": 796, + "time_per_iteration": 2.8617959022521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152933, + "balance_loss_mlp": 1.12994909, + "epoch": 0.15332820315505963, + "flos": 793602593280.0, + "grad_norm": 0.11279362274359876, + "language_loss": 0.90298712, + "learning_rate": 0.0009606477759694969, + "loss": 0.91451651, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.22998047, + "step": 797, + "time_per_iteration": 3.054548978805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147506, + "balance_loss_mlp": 1.12495136, + "epoch": 0.1535205848403232, + "flos": 550206950400.0, + "grad_norm": 0.1240450491743707, + "language_loss": 0.87260056, + "learning_rate": 0.0009605265392510703, + "loss": 0.88407564, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.22546387, + "step": 798, + "time_per_iteration": 2.660917282104492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164418, + "balance_loss_mlp": 1.14092219, + "epoch": 0.15371296652558677, + "flos": 535947840000.0, + "grad_norm": 0.07786555450456673, + "language_loss": 0.91656721, + "learning_rate": 0.0009604051237403846, + "loss": 0.92821133, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.23474121, + "step": 799, + "time_per_iteration": 2.6837708950042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01189347, + "balance_loss_mlp": 1.16668534, + "epoch": 0.15390534821085033, + "flos": 395219699712.0, + "grad_norm": 0.09844042951466975, + "language_loss": 0.85933173, + "learning_rate": 0.0009602835294845776, + "loss": 0.87122524, + "num_input_tokens_seen": 
66281456, + "router_z_loss_mlp": 0.2265625, + "step": 800, + "time_per_iteration": 2.4643006324768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201717, + "balance_loss_mlp": 1.17804241, + "epoch": 0.1540977298961139, + "flos": 535846523904.0, + "grad_norm": 0.08383413994751185, + "language_loss": 0.90000272, + "learning_rate": 0.0009601617565308565, + "loss": 0.91201991, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.23681641, + "step": 801, + "time_per_iteration": 2.6335196495056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211967, + "balance_loss_mlp": 1.18856657, + "epoch": 0.15429011158137745, + "flos": 723727664640.0, + "grad_norm": 0.11945007862770202, + "language_loss": 0.86351627, + "learning_rate": 0.0009600398049264977, + "loss": 0.87563592, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.23413086, + "step": 802, + "time_per_iteration": 3.0110597610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188433, + "balance_loss_mlp": 1.16469824, + "epoch": 0.154482493266641, + "flos": 620516505600.0, + "grad_norm": 0.08697800210878956, + "language_loss": 0.9162643, + "learning_rate": 0.0009599176747188469, + "loss": 0.92814863, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.23718262, + "step": 803, + "time_per_iteration": 2.828881025314331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169648, + "balance_loss_mlp": 1.14716554, + "epoch": 0.15467487495190457, + "flos": 525624629760.0, + "grad_norm": 0.09755966571473051, + "language_loss": 0.82901067, + "learning_rate": 0.0009597953659553196, + "loss": 0.84070712, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.22485352, + "step": 804, + "time_per_iteration": 2.744241952896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163065, + "balance_loss_mlp": 1.14110649, + "epoch": 0.15486725663716813, + "flos": 527729872896.0, + "grad_norm": 
0.08461871579014175, + "language_loss": 0.8877238, + "learning_rate": 0.0009596728786833997, + "loss": 0.89935452, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.21960449, + "step": 805, + "time_per_iteration": 2.637615203857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153079, + "balance_loss_mlp": 1.13075089, + "epoch": 0.1550596383224317, + "flos": 1048549349376.0, + "grad_norm": 0.07567223700797457, + "language_loss": 0.89263672, + "learning_rate": 0.0009595502129506415, + "loss": 0.90416753, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.2232666, + "step": 806, + "time_per_iteration": 3.381657838821411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157381, + "balance_loss_mlp": 1.13502955, + "epoch": 0.15525202000769528, + "flos": 613716963840.0, + "grad_norm": 0.08260081287011234, + "language_loss": 0.82411599, + "learning_rate": 0.0009594273688046678, + "loss": 0.8356899, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.22351074, + "step": 807, + "time_per_iteration": 2.7444403171539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135063, + "balance_loss_mlp": 1.11292577, + "epoch": 0.15544440169295884, + "flos": 533064374784.0, + "grad_norm": 0.12637371348528909, + "language_loss": 0.85436296, + "learning_rate": 0.000959304346293171, + "loss": 0.8657136, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.22155762, + "step": 808, + "time_per_iteration": 2.630800485610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138798, + "balance_loss_mlp": 1.11732841, + "epoch": 0.1556367833782224, + "flos": 644723297280.0, + "grad_norm": 0.1222248699411619, + "language_loss": 0.87775064, + "learning_rate": 0.0009591811454639125, + "loss": 0.8891387, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.21484375, + "step": 809, + "time_per_iteration": 2.7841880321502686 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01140586, + "balance_loss_mlp": 1.11836529, + "epoch": 0.15582916506348596, + "flos": 543822211584.0, + "grad_norm": 0.0775612296994351, + "language_loss": 0.87793982, + "learning_rate": 0.0009590577663647234, + "loss": 0.88934565, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.22216797, + "step": 810, + "time_per_iteration": 2.7182021141052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171751, + "balance_loss_mlp": 1.14905357, + "epoch": 0.15602154674874952, + "flos": 580034566656.0, + "grad_norm": 0.0958777530802899, + "language_loss": 0.85768712, + "learning_rate": 0.0009589342090435036, + "loss": 0.86940467, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.22692871, + "step": 811, + "time_per_iteration": 2.794064521789551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186257, + "balance_loss_mlp": 1.16242695, + "epoch": 0.15621392843401308, + "flos": 535248539136.0, + "grad_norm": 0.07937656910484732, + "language_loss": 0.86963636, + "learning_rate": 0.0009588104735482223, + "loss": 0.88149893, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.23803711, + "step": 812, + "time_per_iteration": 2.7221293449401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208738, + "balance_loss_mlp": 1.18419302, + "epoch": 0.15640631011927664, + "flos": 550903680000.0, + "grad_norm": 0.1117655096069856, + "language_loss": 0.83743179, + "learning_rate": 0.0009586865599269177, + "loss": 0.84951913, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.24536133, + "step": 813, + "time_per_iteration": 2.690633773803711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01238315, + "balance_loss_mlp": 1.21402001, + "epoch": 0.1565986918045402, + "flos": 637478843904.0, + "grad_norm": 0.10590050341373854, + "language_loss": 0.8774755, + "learning_rate": 0.0009585624682276977, + "loss": 0.88985866, + "num_input_tokens_seen": 67312976, 
+ "router_z_loss_mlp": 0.24291992, + "step": 814, + "time_per_iteration": 2.756228446960449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269022, + "balance_loss_mlp": 1.24407113, + "epoch": 0.15679107348980378, + "flos": 490810876416.0, + "grad_norm": 0.10996897761132594, + "language_loss": 0.87169892, + "learning_rate": 0.0009584381984987386, + "loss": 0.88438916, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.24938965, + "step": 815, + "time_per_iteration": 2.554874897003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01264413, + "balance_loss_mlp": 1.23899746, + "epoch": 0.15698345517506734, + "flos": 529951113216.0, + "grad_norm": 0.08063052755401852, + "language_loss": 0.89821672, + "learning_rate": 0.0009583137507882864, + "loss": 0.91086084, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.25415039, + "step": 816, + "time_per_iteration": 2.667743444442749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249115, + "balance_loss_mlp": 1.22435474, + "epoch": 0.1571758368603309, + "flos": 546038682624.0, + "grad_norm": 0.09885575067946582, + "language_loss": 0.80580056, + "learning_rate": 0.000958189125144656, + "loss": 0.81829166, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.24768066, + "step": 817, + "time_per_iteration": 2.727062463760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234827, + "balance_loss_mlp": 1.21099687, + "epoch": 0.15736821854559446, + "flos": 565649547264.0, + "grad_norm": 0.07125848643416562, + "language_loss": 0.88058704, + "learning_rate": 0.0009580643216162313, + "loss": 0.89293534, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.23803711, + "step": 818, + "time_per_iteration": 2.7225098609924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207, + "balance_loss_mlp": 1.18336058, + "epoch": 0.15756060023085802, + "flos": 500956047360.0, + "grad_norm": 0.1140894572848919, + 
"language_loss": 0.79018641, + "learning_rate": 0.0009579393402514652, + "loss": 0.80225646, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.23608398, + "step": 819, + "time_per_iteration": 2.623739004135132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174289, + "balance_loss_mlp": 1.15172231, + "epoch": 0.15775298191612158, + "flos": 519264857088.0, + "grad_norm": 0.099553940880187, + "language_loss": 0.90219855, + "learning_rate": 0.0009578141810988801, + "loss": 0.9139415, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.22546387, + "step": 820, + "time_per_iteration": 2.6413519382476807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115754, + "balance_loss_mlp": 1.13443768, + "epoch": 0.15794536360138514, + "flos": 466129810944.0, + "grad_norm": 0.07166699024259414, + "language_loss": 0.90092921, + "learning_rate": 0.0009576888442070668, + "loss": 0.91250455, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.23095703, + "step": 821, + "time_per_iteration": 2.586008310317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114947, + "balance_loss_mlp": 1.12679601, + "epoch": 0.1581377452866487, + "flos": 517162185216.0, + "grad_norm": 0.12314887338256089, + "language_loss": 0.91971326, + "learning_rate": 0.0009575633296246854, + "loss": 0.93120795, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.22668457, + "step": 822, + "time_per_iteration": 2.582914113998413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153752, + "balance_loss_mlp": 1.13104272, + "epoch": 0.15833012697191226, + "flos": 549784433664.0, + "grad_norm": 0.08930039023036396, + "language_loss": 0.83068377, + "learning_rate": 0.0009574376374004652, + "loss": 0.84222132, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.22692871, + "step": 823, + "time_per_iteration": 2.689706563949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174252, 
+ "balance_loss_mlp": 1.15108991, + "epoch": 0.15852250865717585, + "flos": 487457906688.0, + "grad_norm": 0.08166713358237257, + "language_loss": 0.80265462, + "learning_rate": 0.000957311767583204, + "loss": 0.81439716, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.23156738, + "step": 824, + "time_per_iteration": 2.5872888565063477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134272, + "balance_loss_mlp": 1.11863208, + "epoch": 0.1587148903424394, + "flos": 1309770694656.0, + "grad_norm": 0.027722115426624477, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83205861, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.15625, + "step": 825, + "time_per_iteration": 4.749661445617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186754, + "balance_loss_mlp": 1.16332912, + "epoch": 0.15890727202770297, + "flos": 466873528320.0, + "grad_norm": 0.0939924469385621, + "language_loss": 0.91145539, + "learning_rate": 0.0009570594953650961, + "loss": 0.92332292, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.23425293, + "step": 826, + "time_per_iteration": 2.5129754543304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211327, + "balance_loss_mlp": 1.1879499, + "epoch": 0.15909965371296653, + "flos": 777107188224.0, + "grad_norm": 0.08032409834180723, + "language_loss": 0.80093443, + "learning_rate": 0.00095693309306219, + "loss": 0.81304777, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.23364258, + "step": 827, + "time_per_iteration": 3.116727352142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203567, + "balance_loss_mlp": 1.17957044, + "epoch": 0.1592920353982301, + "flos": 1078273451520.0, + "grad_norm": 0.07716345894173686, + "language_loss": 0.87652111, + "learning_rate": 0.0009568065133621244, + "loss": 0.88855684, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.23986816, + 
"step": 828, + "time_per_iteration": 3.3514394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186554, + "balance_loss_mlp": 1.1635462, + "epoch": 0.15948441708349365, + "flos": 725622935040.0, + "grad_norm": 0.09010150887645839, + "language_loss": 0.84615266, + "learning_rate": 0.0009566797563140422, + "loss": 0.85801816, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.23022461, + "step": 829, + "time_per_iteration": 2.8772377967834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178682, + "balance_loss_mlp": 1.15541196, + "epoch": 0.1596767987687572, + "flos": 578771785728.0, + "grad_norm": 0.07629618570457763, + "language_loss": 0.87662935, + "learning_rate": 0.0009565528219671547, + "loss": 0.88841611, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.23266602, + "step": 830, + "time_per_iteration": 2.9242594242095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168807, + "balance_loss_mlp": 1.14639533, + "epoch": 0.15986918045402077, + "flos": 528987511296.0, + "grad_norm": 0.07916714158721186, + "language_loss": 0.84442008, + "learning_rate": 0.0009564257103707418, + "loss": 0.85610813, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.22424316, + "step": 831, + "time_per_iteration": 2.615751266479492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115633, + "balance_loss_mlp": 1.13395441, + "epoch": 0.16006156213928435, + "flos": 574584067584.0, + "grad_norm": 0.07401424691307211, + "language_loss": 0.9042899, + "learning_rate": 0.0009562984215741533, + "loss": 0.91585314, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.22387695, + "step": 832, + "time_per_iteration": 2.666475296020508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143834, + "balance_loss_mlp": 1.12204242, + "epoch": 0.1602539438245479, + "flos": 515541127680.0, + "grad_norm": 0.07498028486943187, + "language_loss": 0.82129556, + 
"learning_rate": 0.0009561709556268065, + "loss": 0.83273387, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.21801758, + "step": 833, + "time_per_iteration": 2.757997512817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139242, + "balance_loss_mlp": 1.11768937, + "epoch": 0.16044632550981147, + "flos": 621015745536.0, + "grad_norm": 0.09759486121205484, + "language_loss": 0.94624776, + "learning_rate": 0.0009560433125781884, + "loss": 0.95764017, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.21569824, + "step": 834, + "time_per_iteration": 2.7897424697875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141895, + "balance_loss_mlp": 1.12007987, + "epoch": 0.16063870719507503, + "flos": 561078586368.0, + "grad_norm": 0.11927807309460302, + "language_loss": 0.92270857, + "learning_rate": 0.0009559154924778544, + "loss": 0.93412757, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.21838379, + "step": 835, + "time_per_iteration": 2.7300117015838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146944, + "balance_loss_mlp": 1.12510526, + "epoch": 0.1608310888803386, + "flos": 805133882880.0, + "grad_norm": 0.08296253434867956, + "language_loss": 0.85007012, + "learning_rate": 0.0009557874953754284, + "loss": 0.8615396, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.21862793, + "step": 836, + "time_per_iteration": 3.0692667961120605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171621, + "balance_loss_mlp": 1.15024722, + "epoch": 0.16102347056560215, + "flos": 600587011584.0, + "grad_norm": 0.08629072980134203, + "language_loss": 0.83071995, + "learning_rate": 0.0009556593213206038, + "loss": 0.84243613, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.21374512, + "step": 837, + "time_per_iteration": 2.762371778488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198448, + "balance_loss_mlp": 
1.17696667, + "epoch": 0.1612158522508657, + "flos": 553510361088.0, + "grad_norm": 0.07520594985182873, + "language_loss": 0.8681106, + "learning_rate": 0.0009555309703631414, + "loss": 0.88009512, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.21484375, + "step": 838, + "time_per_iteration": 2.721184253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216338, + "balance_loss_mlp": 1.19352138, + "epoch": 0.16140823393612927, + "flos": 555963969024.0, + "grad_norm": 0.08529357587841585, + "language_loss": 0.87116075, + "learning_rate": 0.0009554024425528722, + "loss": 0.88332415, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.22802734, + "step": 839, + "time_per_iteration": 2.7104406356811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223488, + "balance_loss_mlp": 1.20211315, + "epoch": 0.16160061562139286, + "flos": 543871770624.0, + "grad_norm": 0.09500040264705899, + "language_loss": 0.88661861, + "learning_rate": 0.0009552737379396948, + "loss": 0.89885342, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.21386719, + "step": 840, + "time_per_iteration": 2.6247448921203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214716, + "balance_loss_mlp": 1.19292414, + "epoch": 0.16179299730665642, + "flos": 603873169920.0, + "grad_norm": 0.06615948862952871, + "language_loss": 0.87843263, + "learning_rate": 0.0009551448565735767, + "loss": 0.8905797, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.21826172, + "step": 841, + "time_per_iteration": 2.8262698650360107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211691, + "balance_loss_mlp": 1.19057953, + "epoch": 0.16198537899191998, + "flos": 787166097408.0, + "grad_norm": 0.09887794790206932, + "language_loss": 0.8426103, + "learning_rate": 0.0009550157985045543, + "loss": 0.85472721, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.21130371, + "step": 842, + 
"time_per_iteration": 3.0120604038238525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206189, + "balance_loss_mlp": 1.18486238, + "epoch": 0.16217776067718354, + "flos": 519805942272.0, + "grad_norm": 0.08797554821911514, + "language_loss": 0.88739967, + "learning_rate": 0.0009548865637827321, + "loss": 0.89946151, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.21337891, + "step": 843, + "time_per_iteration": 2.6481337547302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204978, + "balance_loss_mlp": 1.18372297, + "epoch": 0.1623701423624471, + "flos": 505262707200.0, + "grad_norm": 0.09077498619786414, + "language_loss": 0.89573538, + "learning_rate": 0.0009547571524582838, + "loss": 0.90778512, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.21264648, + "step": 844, + "time_per_iteration": 2.5942928791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183078, + "balance_loss_mlp": 1.16156065, + "epoch": 0.16256252404771065, + "flos": 497183132160.0, + "grad_norm": 0.0818153207807116, + "language_loss": 0.92094475, + "learning_rate": 0.0009546275645814512, + "loss": 0.93277556, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.21533203, + "step": 845, + "time_per_iteration": 2.6533596515655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183489, + "balance_loss_mlp": 1.16250849, + "epoch": 0.16275490573297421, + "flos": 502344737280.0, + "grad_norm": 0.09434310518898727, + "language_loss": 0.89099437, + "learning_rate": 0.0009544978002025446, + "loss": 0.90282923, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.20983887, + "step": 846, + "time_per_iteration": 2.595737934112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174812, + "balance_loss_mlp": 1.15389085, + "epoch": 0.16294728741823777, + "flos": 507231756288.0, + "grad_norm": 0.0786790126962769, + "language_loss": 0.86643338, + 
"learning_rate": 0.0009543678593719434, + "loss": 0.87818146, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.20922852, + "step": 847, + "time_per_iteration": 2.734328508377075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172317, + "balance_loss_mlp": 1.1513598, + "epoch": 0.16313966910350133, + "flos": 509685364224.0, + "grad_norm": 0.07855126038492752, + "language_loss": 0.87300336, + "learning_rate": 0.0009542377421400945, + "loss": 0.88472658, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.2097168, + "step": 848, + "time_per_iteration": 2.8172829151153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168862, + "balance_loss_mlp": 1.14789319, + "epoch": 0.16333205078876492, + "flos": 543980427264.0, + "grad_norm": 0.06818105137358721, + "language_loss": 0.83380383, + "learning_rate": 0.0009541074485575145, + "loss": 0.84549248, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.20983887, + "step": 849, + "time_per_iteration": 2.7554948329925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153029, + "balance_loss_mlp": 1.13229823, + "epoch": 0.16352443247402848, + "flos": 507723655680.0, + "grad_norm": 0.07075228162905194, + "language_loss": 0.91935623, + "learning_rate": 0.0009539769786747874, + "loss": 0.93088651, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.20739746, + "step": 850, + "time_per_iteration": 2.681631326675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150714, + "balance_loss_mlp": 1.13010252, + "epoch": 0.16371681415929204, + "flos": 542124804096.0, + "grad_norm": 0.07677284982742894, + "language_loss": 0.80944598, + "learning_rate": 0.0009538463325425665, + "loss": 0.82095313, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.20617676, + "step": 851, + "time_per_iteration": 2.735233783721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154047, + "balance_loss_mlp": 
1.13384068, + "epoch": 0.1639091958445556, + "flos": 520752291840.0, + "grad_norm": 0.11739032058616317, + "language_loss": 0.85686159, + "learning_rate": 0.0009537155102115728, + "loss": 0.86840206, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.20202637, + "step": 852, + "time_per_iteration": 2.620140790939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130528, + "balance_loss_mlp": 1.11065602, + "epoch": 0.16410157752981916, + "flos": 547414889472.0, + "grad_norm": 0.10634949324965158, + "language_loss": 0.83208728, + "learning_rate": 0.0009535845117325961, + "loss": 0.84339261, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.1986084, + "step": 853, + "time_per_iteration": 2.664644241333008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137711, + "balance_loss_mlp": 1.11726654, + "epoch": 0.16429395921508272, + "flos": 582853791744.0, + "grad_norm": 0.07583670741084705, + "language_loss": 0.9317174, + "learning_rate": 0.0009534533371564946, + "loss": 0.94309455, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.20446777, + "step": 854, + "time_per_iteration": 2.801784038543701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132055, + "balance_loss_mlp": 1.11093068, + "epoch": 0.16448634090034628, + "flos": 530934538752.0, + "grad_norm": 0.10901038327062007, + "language_loss": 0.88220453, + "learning_rate": 0.0009533219865341949, + "loss": 0.89352506, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.21130371, + "step": 855, + "time_per_iteration": 2.5974481105804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145642, + "balance_loss_mlp": 1.12525666, + "epoch": 0.16467872258560984, + "flos": 491890475520.0, + "grad_norm": 0.08694797679629615, + "language_loss": 0.86617303, + "learning_rate": 0.0009531904599166916, + "loss": 0.87762946, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.20385742, + "step": 856, + 
"time_per_iteration": 2.6515426635742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165234, + "balance_loss_mlp": 1.1438601, + "epoch": 0.16487110427087343, + "flos": 506263385088.0, + "grad_norm": 0.10972732987763288, + "language_loss": 0.84639692, + "learning_rate": 0.0009530587573550478, + "loss": 0.85804921, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.21374512, + "step": 857, + "time_per_iteration": 2.5966737270355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141097, + "balance_loss_mlp": 1.1243124, + "epoch": 0.16506348595613698, + "flos": 1432824712704.0, + "grad_norm": 0.04856663639913232, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75460482, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.16796875, + "step": 858, + "time_per_iteration": 5.004236698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122785, + "balance_loss_mlp": 1.20751262, + "epoch": 0.16525586764140054, + "flos": 477129927168.0, + "grad_norm": 0.09065075677374754, + "language_loss": 0.89923048, + "learning_rate": 0.0009527948246039337, + "loss": 0.91150904, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.20336914, + "step": 859, + "time_per_iteration": 2.5762951374053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01250043, + "balance_loss_mlp": 1.22891951, + "epoch": 0.1654482493266641, + "flos": 881096942592.0, + "grad_norm": 0.10611361403402562, + "language_loss": 0.87094891, + "learning_rate": 0.000952662594516931, + "loss": 0.88344932, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.21130371, + "step": 860, + "time_per_iteration": 3.1250970363616943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01211235, + "balance_loss_mlp": 1.19042134, + "epoch": 0.16564063101192766, + "flos": 626841773568.0, + "grad_norm": 0.07567437441181586, + "language_loss": 0.86383927, + 
"learning_rate": 0.0009525301886907234, + "loss": 0.87595159, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.20812988, + "step": 861, + "time_per_iteration": 2.8821423053741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119851, + "balance_loss_mlp": 1.17725468, + "epoch": 0.16583301269719122, + "flos": 561518355456.0, + "grad_norm": 0.09117738037536942, + "language_loss": 0.87712085, + "learning_rate": 0.0009523976071767155, + "loss": 0.88910592, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.21252441, + "step": 862, + "time_per_iteration": 2.7509195804595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164214, + "balance_loss_mlp": 1.14342415, + "epoch": 0.16602539438245478, + "flos": 567803976192.0, + "grad_norm": 0.08626936460480303, + "language_loss": 0.87840152, + "learning_rate": 0.00095226485002638, + "loss": 0.89004362, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.20800781, + "step": 863, + "time_per_iteration": 2.835188150405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148836, + "balance_loss_mlp": 1.12823641, + "epoch": 0.16621777606771834, + "flos": 574875532800.0, + "grad_norm": 0.09501605355258884, + "language_loss": 0.88929522, + "learning_rate": 0.0009521319172912576, + "loss": 0.90078366, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.20605469, + "step": 864, + "time_per_iteration": 2.773681879043579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148606, + "balance_loss_mlp": 1.12822115, + "epoch": 0.16641015775298193, + "flos": 514552932864.0, + "grad_norm": 0.1262431233900787, + "language_loss": 0.94519138, + "learning_rate": 0.0009519988090229579, + "loss": 0.95667744, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.20385742, + "step": 865, + "time_per_iteration": 2.7055397033691406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134112, + "balance_loss_mlp": 
1.11327457, + "epoch": 0.1666025394382455, + "flos": 621685310976.0, + "grad_norm": 0.10486068908473449, + "language_loss": 0.87655658, + "learning_rate": 0.0009518655252731576, + "loss": 0.88789773, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.20849609, + "step": 866, + "time_per_iteration": 2.774974822998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124242, + "balance_loss_mlp": 1.102844, + "epoch": 0.16679492112350905, + "flos": 548808348672.0, + "grad_norm": 0.09006182482731041, + "language_loss": 0.90070617, + "learning_rate": 0.0009517320660936022, + "loss": 0.91194862, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.2142334, + "step": 867, + "time_per_iteration": 2.7388041019439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126447, + "balance_loss_mlp": 1.1068728, + "epoch": 0.1669873028087726, + "flos": 665675864064.0, + "grad_norm": 0.09548967470897408, + "language_loss": 0.82877147, + "learning_rate": 0.0009515984315361051, + "loss": 0.84003592, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.19555664, + "step": 868, + "time_per_iteration": 2.822772264480591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113243, + "balance_loss_mlp": 1.11205709, + "epoch": 0.16717968449403617, + "flos": 538564432896.0, + "grad_norm": 0.10934486098426227, + "language_loss": 0.86598766, + "learning_rate": 0.000951464621652548, + "loss": 0.87731194, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.20373535, + "step": 869, + "time_per_iteration": 2.648505687713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159804, + "balance_loss_mlp": 1.13964605, + "epoch": 0.16737206617929973, + "flos": 530121438720.0, + "grad_norm": 0.11951376597850719, + "language_loss": 0.7861675, + "learning_rate": 0.0009513306364948804, + "loss": 0.79776561, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.20153809, + "step": 870, + 
"time_per_iteration": 2.781686305999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188864, + "balance_loss_mlp": 1.16833639, + "epoch": 0.1675644478645633, + "flos": 480774362112.0, + "grad_norm": 0.09165243347067362, + "language_loss": 0.88987041, + "learning_rate": 0.0009511964761151197, + "loss": 0.90175903, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.2052002, + "step": 871, + "time_per_iteration": 2.5691447257995605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0122616, + "balance_loss_mlp": 1.20546532, + "epoch": 0.16775682954982685, + "flos": 494556627456.0, + "grad_norm": 0.09901550717192838, + "language_loss": 0.90224719, + "learning_rate": 0.0009510621405653521, + "loss": 0.91450876, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.20690918, + "step": 872, + "time_per_iteration": 2.585707426071167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191078, + "balance_loss_mlp": 1.17049098, + "epoch": 0.1679492112350904, + "flos": 752035912704.0, + "grad_norm": 0.11167023861469132, + "language_loss": 0.83886391, + "learning_rate": 0.0009509276298977309, + "loss": 0.85077471, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.20581055, + "step": 873, + "time_per_iteration": 2.970672607421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177127, + "balance_loss_mlp": 1.15688562, + "epoch": 0.168141592920354, + "flos": 1135875571200.0, + "grad_norm": 0.09073459995989616, + "language_loss": 0.81845176, + "learning_rate": 0.0009507929441644778, + "loss": 0.83022296, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.20239258, + "step": 874, + "time_per_iteration": 3.5511813163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137436, + "balance_loss_mlp": 1.11694419, + "epoch": 0.16833397460561755, + "flos": 632401302528.0, + "grad_norm": 0.09068306382456774, + "language_loss": 0.85649496, + "learning_rate": 
0.0009506580834178826, + "loss": 0.86786938, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.20495605, + "step": 875, + "time_per_iteration": 2.797485589981079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130428, + "balance_loss_mlp": 1.10986471, + "epoch": 0.1685263562908811, + "flos": 541445326848.0, + "grad_norm": 0.09154973704283995, + "language_loss": 0.91347295, + "learning_rate": 0.0009505230477103028, + "loss": 0.92477721, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.20568848, + "step": 876, + "time_per_iteration": 2.70495867729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145788, + "balance_loss_mlp": 1.12518883, + "epoch": 0.16871873797614467, + "flos": 619325678592.0, + "grad_norm": 0.10157591470828177, + "language_loss": 0.8152402, + "learning_rate": 0.0009503878370941641, + "loss": 0.82669806, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.20593262, + "step": 877, + "time_per_iteration": 2.735748052597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151178, + "balance_loss_mlp": 1.13084054, + "epoch": 0.16891111966140823, + "flos": 606344030208.0, + "grad_norm": 0.15099055549540594, + "language_loss": 0.88741207, + "learning_rate": 0.0009502524516219595, + "loss": 0.89892387, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.20336914, + "step": 878, + "time_per_iteration": 2.730163812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160201, + "balance_loss_mlp": 1.13942301, + "epoch": 0.1691035013466718, + "flos": 552326874624.0, + "grad_norm": 0.11693714010182361, + "language_loss": 0.9004457, + "learning_rate": 0.0009501168913462506, + "loss": 0.91204774, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.20788574, + "step": 879, + "time_per_iteration": 2.684805393218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088136, + "balance_loss_mlp": 1.07440281, + "epoch": 
0.16929588303193535, + "flos": 1476294377472.0, + "grad_norm": 0.04309817230007909, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80210066, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.13769531, + "step": 880, + "time_per_iteration": 4.804383039474487 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166032, + "balance_loss_mlp": 1.14521825, + "epoch": 0.1694882647171989, + "flos": 926248587264.0, + "grad_norm": 0.08467938058221719, + "language_loss": 0.85053843, + "learning_rate": 0.0009498452465949042, + "loss": 0.86219883, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.20825195, + "step": 881, + "time_per_iteration": 3.276735305786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201291, + "balance_loss_mlp": 1.17981005, + "epoch": 0.1696806464024625, + "flos": 546093010944.0, + "grad_norm": 0.06992657838118156, + "language_loss": 0.91281927, + "learning_rate": 0.0009497091622247285, + "loss": 0.92483222, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.21484375, + "step": 882, + "time_per_iteration": 2.70647931098938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01200556, + "balance_loss_mlp": 1.17901504, + "epoch": 0.16987302808772606, + "flos": 529234560000.0, + "grad_norm": 0.0696336676613267, + "language_loss": 0.93377209, + "learning_rate": 0.0009495729032619723, + "loss": 0.94577771, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.2154541, + "step": 883, + "time_per_iteration": 2.7534360885620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01227373, + "balance_loss_mlp": 1.20546222, + "epoch": 0.17006540977298962, + "flos": 755178909696.0, + "grad_norm": 0.08705372199297186, + "language_loss": 0.83726418, + "learning_rate": 0.0009494364697595354, + "loss": 0.84953797, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.21923828, + "step": 884, + "time_per_iteration": 
2.9550111293792725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01242229, + "balance_loss_mlp": 1.22078347, + "epoch": 0.17025779145825318, + "flos": 558800446464.0, + "grad_norm": 0.08532836159387652, + "language_loss": 0.89805126, + "learning_rate": 0.0009492998617703867, + "loss": 0.91047359, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.21472168, + "step": 885, + "time_per_iteration": 2.710296154022217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216994, + "balance_loss_mlp": 1.19604921, + "epoch": 0.17045017314351674, + "flos": 512213124096.0, + "grad_norm": 0.08218136336540412, + "language_loss": 0.87561512, + "learning_rate": 0.0009491630793475619, + "loss": 0.88778508, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.20959473, + "step": 886, + "time_per_iteration": 2.6574454307556152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223448, + "balance_loss_mlp": 1.20190716, + "epoch": 0.1706425548287803, + "flos": 508941646848.0, + "grad_norm": 0.06673733954054763, + "language_loss": 0.85054195, + "learning_rate": 0.0009490261225441643, + "loss": 0.8627764, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.2154541, + "step": 887, + "time_per_iteration": 2.9003562927246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209094, + "balance_loss_mlp": 1.18812537, + "epoch": 0.17083493651404386, + "flos": 717355408896.0, + "grad_norm": 0.07511336927499353, + "language_loss": 0.89910543, + "learning_rate": 0.0009488889914133656, + "loss": 0.91119635, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.2097168, + "step": 888, + "time_per_iteration": 2.9909205436706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121642, + "balance_loss_mlp": 1.19492674, + "epoch": 0.17102731819930742, + "flos": 559121647104.0, + "grad_norm": 0.07825003291748035, + "language_loss": 0.88796103, + "learning_rate": 
0.0009487516860084047, + "loss": 0.90012527, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.21496582, + "step": 889, + "time_per_iteration": 2.7500009536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192765, + "balance_loss_mlp": 1.17159319, + "epoch": 0.17121969988457098, + "flos": 494786423808.0, + "grad_norm": 0.10600638107264272, + "language_loss": 0.88708925, + "learning_rate": 0.0009486142063825884, + "loss": 0.89901692, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.21179199, + "step": 890, + "time_per_iteration": 2.583644390106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212506, + "balance_loss_mlp": 1.19724751, + "epoch": 0.17141208156983456, + "flos": 1548889413120.0, + "grad_norm": 0.09034147523214399, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.73638725, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.15234375, + "step": 891, + "time_per_iteration": 4.9979774951934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175422, + "balance_loss_mlp": 1.1550256, + "epoch": 0.17160446325509812, + "flos": 619565386752.0, + "grad_norm": 0.1258166683219009, + "language_loss": 0.89561093, + "learning_rate": 0.0009483387246819542, + "loss": 0.9073652, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.20397949, + "step": 892, + "time_per_iteration": 2.7332327365875244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068395, + "balance_loss_mlp": 1.05304134, + "epoch": 0.17179684494036168, + "flos": 1381758206976.0, + "grad_norm": 0.03219618488122811, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83353972, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.15332031, + "step": 893, + "time_per_iteration": 4.691076993942261 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142338, + "balance_loss_mlp": 1.12172627, + 
"epoch": 0.17198922662562524, + "flos": 492636764160.0, + "grad_norm": 0.0974140714584663, + "language_loss": 0.88822401, + "learning_rate": 0.0009480625467392688, + "loss": 0.89964741, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.20617676, + "step": 894, + "time_per_iteration": 2.646313190460205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_mlp": 1.02080703, + "epoch": 0.1721816083108888, + "flos": 1458318878208.0, + "grad_norm": 0.032237767215918686, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79031026, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.15527344, + "step": 895, + "time_per_iteration": 4.73791241645813 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134198, + "balance_loss_mlp": 1.11333644, + "epoch": 0.17237398999615236, + "flos": 528122654208.0, + "grad_norm": 0.07818472841262332, + "language_loss": 0.8733896, + "learning_rate": 0.0009477856729834196, + "loss": 0.88473153, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.20874023, + "step": 896, + "time_per_iteration": 2.7401630878448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132796, + "balance_loss_mlp": 1.11235166, + "epoch": 0.17256637168141592, + "flos": 603920157696.0, + "grad_norm": 0.07866740874279901, + "language_loss": 0.89730608, + "learning_rate": 0.0009476469753098809, + "loss": 0.90863407, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.20446777, + "step": 897, + "time_per_iteration": 2.7601003646850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141178, + "balance_loss_mlp": 1.12072182, + "epoch": 0.17275875336667948, + "flos": 509687935488.0, + "grad_norm": 0.08200394390051394, + "language_loss": 0.86714321, + "learning_rate": 0.0009475081038443738, + "loss": 0.878555, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.20458984, + "step": 898, + 
"time_per_iteration": 2.621018171310425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137375, + "balance_loss_mlp": 1.11602414, + "epoch": 0.17295113505194307, + "flos": 665260687872.0, + "grad_norm": 0.07995623076613839, + "language_loss": 0.85080326, + "learning_rate": 0.0009473690586408124, + "loss": 0.86217701, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.21374512, + "step": 899, + "time_per_iteration": 2.8553502559661865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149397, + "balance_loss_mlp": 1.12811828, + "epoch": 0.17314351673720663, + "flos": 555385807872.0, + "grad_norm": 0.08690536389731517, + "language_loss": 0.85954648, + "learning_rate": 0.0009472298397531792, + "loss": 0.87104046, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.21276855, + "step": 900, + "time_per_iteration": 2.7427260875701904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141965, + "balance_loss_mlp": 1.12017393, + "epoch": 0.17333589842247019, + "flos": 503609716224.0, + "grad_norm": 0.12119389218583115, + "language_loss": 0.86411273, + "learning_rate": 0.0009470904472355235, + "loss": 0.87553239, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.21801758, + "step": 901, + "time_per_iteration": 2.6585657596588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138192, + "balance_loss_mlp": 1.11563718, + "epoch": 0.17352828010773375, + "flos": 556208446464.0, + "grad_norm": 0.08947887393013387, + "language_loss": 0.79425454, + "learning_rate": 0.0009469508811419626, + "loss": 0.80563653, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.22570801, + "step": 902, + "time_per_iteration": 2.725372791290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207563, + "balance_loss_mlp": 1.1882031, + "epoch": 0.1737206617929973, + "flos": 1554525292032.0, + "grad_norm": 0.06736803575768126, + "language_loss": 0.7161383, + 
"learning_rate": 0.0009468111415266806, + "loss": 0.72821391, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.19335938, + "step": 903, + "time_per_iteration": 4.819333553314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138702, + "balance_loss_mlp": 1.1156832, + "epoch": 0.17391304347826086, + "flos": 516662945280.0, + "grad_norm": 0.10583475939723401, + "language_loss": 0.83563209, + "learning_rate": 0.0009466712284439292, + "loss": 0.84701914, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.23022461, + "step": 904, + "time_per_iteration": 2.7723944187164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136442, + "balance_loss_mlp": 1.11426902, + "epoch": 0.17410542516352442, + "flos": 541049974272.0, + "grad_norm": 0.09911822478323383, + "language_loss": 0.88385195, + "learning_rate": 0.0009465311419480276, + "loss": 0.89521635, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.22180176, + "step": 905, + "time_per_iteration": 2.708866596221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161825, + "balance_loss_mlp": 1.14012873, + "epoch": 0.17429780684878798, + "flos": 623849651712.0, + "grad_norm": 0.07480170707629828, + "language_loss": 0.88125765, + "learning_rate": 0.0009463908820933622, + "loss": 0.89287591, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.21704102, + "step": 906, + "time_per_iteration": 2.8386967182159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165754, + "balance_loss_mlp": 1.1450001, + "epoch": 0.17449018853405157, + "flos": 575663666688.0, + "grad_norm": 0.09057770875445449, + "language_loss": 0.82559198, + "learning_rate": 0.0009462504489343868, + "loss": 0.83724952, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.20751953, + "step": 907, + "time_per_iteration": 2.8287012577056885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01182859, + "balance_loss_mlp": 
1.16167533, + "epoch": 0.17468257021931513, + "flos": 533753763840.0, + "grad_norm": 0.0967031701007891, + "language_loss": 0.88244259, + "learning_rate": 0.0009461098425256222, + "loss": 0.89427125, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.21203613, + "step": 908, + "time_per_iteration": 2.636411190032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184624, + "balance_loss_mlp": 1.16438186, + "epoch": 0.1748749519045787, + "flos": 540758509056.0, + "grad_norm": 0.08569423221876828, + "language_loss": 0.85917675, + "learning_rate": 0.0009459690629216567, + "loss": 0.87102294, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.20239258, + "step": 909, + "time_per_iteration": 2.6774063110351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185319, + "balance_loss_mlp": 1.16585207, + "epoch": 0.17506733358984225, + "flos": 498623579136.0, + "grad_norm": 0.06867787211129477, + "language_loss": 0.87373209, + "learning_rate": 0.0009458281101771457, + "loss": 0.88558531, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.19445801, + "step": 910, + "time_per_iteration": 2.6256136894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183744, + "balance_loss_mlp": 1.16421723, + "epoch": 0.1752597152751058, + "flos": 622923125760.0, + "grad_norm": 0.07423161751862324, + "language_loss": 0.82895565, + "learning_rate": 0.0009456869843468122, + "loss": 0.84079307, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.19519043, + "step": 911, + "time_per_iteration": 2.8429157733917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181199, + "balance_loss_mlp": 1.16098118, + "epoch": 0.17545209696036937, + "flos": 520972176384.0, + "grad_norm": 0.10560425483963332, + "language_loss": 0.78068089, + "learning_rate": 0.0009455456854854459, + "loss": 0.79249287, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.20214844, + "step": 912, + 
"time_per_iteration": 2.6220157146453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161811, + "balance_loss_mlp": 1.1425947, + "epoch": 0.17564447864563293, + "flos": 461988707328.0, + "grad_norm": 0.07427056945907796, + "language_loss": 0.84015787, + "learning_rate": 0.0009454042136479039, + "loss": 0.851776, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.19189453, + "step": 913, + "time_per_iteration": 2.5928330421447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170659, + "balance_loss_mlp": 1.15183616, + "epoch": 0.1758368603308965, + "flos": 480655793664.0, + "grad_norm": 0.08169247609196438, + "language_loss": 0.82556438, + "learning_rate": 0.0009452625688891103, + "loss": 0.83727098, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.18798828, + "step": 914, + "time_per_iteration": 2.5541818141937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215011, + "balance_loss_mlp": 1.20032406, + "epoch": 0.17602924201616005, + "flos": 1478942903808.0, + "grad_norm": 0.06355474766062214, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79949749, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.14648438, + "step": 915, + "time_per_iteration": 4.609099864959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151019, + "balance_loss_mlp": 1.13170671, + "epoch": 0.17622162370142364, + "flos": 602301671424.0, + "grad_norm": 0.08978748093655645, + "language_loss": 0.92478371, + "learning_rate": 0.0009449787608278015, + "loss": 0.9362939, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.19299316, + "step": 916, + "time_per_iteration": 2.8081016540527344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144026, + "balance_loss_mlp": 1.12480903, + "epoch": 0.1764140053866872, + "flos": 442699043328.0, + "grad_norm": 0.08892608358050076, + "language_loss": 0.9215048, + "learning_rate": 
0.0009448365976354704, + "loss": 0.93294501, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.19213867, + "step": 917, + "time_per_iteration": 2.5476417541503906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141081, + "balance_loss_mlp": 1.12047005, + "epoch": 0.17660638707195075, + "flos": 500607682560.0, + "grad_norm": 0.10930526403118525, + "language_loss": 0.89404565, + "learning_rate": 0.0009446942617422558, + "loss": 0.90545642, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.20617676, + "step": 918, + "time_per_iteration": 2.6054670810699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159521, + "balance_loss_mlp": 1.13917232, + "epoch": 0.17679876875721431, + "flos": 538892974080.0, + "grad_norm": 0.08039502929266268, + "language_loss": 0.84809625, + "learning_rate": 0.0009445517532034176, + "loss": 0.85969138, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.20349121, + "step": 919, + "time_per_iteration": 2.736720561981201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116126, + "balance_loss_mlp": 1.14050603, + "epoch": 0.17699115044247787, + "flos": 497724217344.0, + "grad_norm": 0.09960932315337, + "language_loss": 0.88503635, + "learning_rate": 0.0009444090720742824, + "loss": 0.89664894, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.20751953, + "step": 920, + "time_per_iteration": 2.5981345176696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118497, + "balance_loss_mlp": 1.16263032, + "epoch": 0.17718353212774143, + "flos": 662738070528.0, + "grad_norm": 0.09080242050592086, + "language_loss": 0.87781966, + "learning_rate": 0.0009442662184102439, + "loss": 0.88966942, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.22351074, + "step": 921, + "time_per_iteration": 2.855386972427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195366, + "balance_loss_mlp": 1.17316878, + "epoch": 
0.177375913813005, + "flos": 582641247744.0, + "grad_norm": 0.07657240030806824, + "language_loss": 0.86990869, + "learning_rate": 0.000944123192266763, + "loss": 0.88186234, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.22216797, + "step": 922, + "time_per_iteration": 2.862642526626587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184819, + "balance_loss_mlp": 1.16284895, + "epoch": 0.17756829549826855, + "flos": 552564011520.0, + "grad_norm": 0.09417779830391854, + "language_loss": 0.83500814, + "learning_rate": 0.0009439799936993671, + "loss": 0.8468563, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.21960449, + "step": 923, + "time_per_iteration": 2.7609872817993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172194, + "balance_loss_mlp": 1.1505692, + "epoch": 0.17776067718353214, + "flos": 556322245632.0, + "grad_norm": 0.183012338078167, + "language_loss": 0.87992036, + "learning_rate": 0.0009438366227636511, + "loss": 0.89164221, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.21630859, + "step": 924, + "time_per_iteration": 2.680379867553711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147406, + "balance_loss_mlp": 1.12692571, + "epoch": 0.1779530588687957, + "flos": 658458574848.0, + "grad_norm": 0.07052119854018758, + "language_loss": 0.8590064, + "learning_rate": 0.0009436930795152763, + "loss": 0.87048048, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.20483398, + "step": 925, + "time_per_iteration": 2.84305477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134796, + "balance_loss_mlp": 1.11461377, + "epoch": 0.17814544055405926, + "flos": 644483589120.0, + "grad_norm": 0.10542322310235813, + "language_loss": 0.86425805, + "learning_rate": 0.0009435493640099713, + "loss": 0.875606, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.20178223, + "step": 926, + "time_per_iteration": 
2.8326363563537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147089, + "balance_loss_mlp": 1.12663293, + "epoch": 0.17833782223932282, + "flos": 460913877504.0, + "grad_norm": 0.1030160256649362, + "language_loss": 0.83799899, + "learning_rate": 0.0009434054763035314, + "loss": 0.8494699, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.20458984, + "step": 927, + "time_per_iteration": 2.6224582195281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142086, + "balance_loss_mlp": 1.12232113, + "epoch": 0.17853020392458638, + "flos": 759539897856.0, + "grad_norm": 0.0964966031181637, + "language_loss": 0.85150439, + "learning_rate": 0.0009432614164518185, + "loss": 0.86292523, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.19750977, + "step": 928, + "time_per_iteration": 2.989607810974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115354, + "balance_loss_mlp": 1.13345337, + "epoch": 0.17872258560984994, + "flos": 782666717184.0, + "grad_norm": 0.11261525147662245, + "language_loss": 0.84222531, + "learning_rate": 0.000943117184510762, + "loss": 0.85376072, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.20080566, + "step": 929, + "time_per_iteration": 3.0107991695404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167369, + "balance_loss_mlp": 1.15220594, + "epoch": 0.1789149672951135, + "flos": 1459880464896.0, + "grad_norm": 0.0706795740425107, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.7995733, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.15136719, + "step": 930, + "time_per_iteration": 5.0069990158081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168853, + "balance_loss_mlp": 1.14890909, + "epoch": 0.17910734898037706, + "flos": 503864105472.0, + "grad_norm": 0.0722944763131068, + "language_loss": 0.885297, + "learning_rate": 0.0009428282045846674, + 
"loss": 0.89698553, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.19934082, + "step": 931, + "time_per_iteration": 2.705216884613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173254, + "balance_loss_mlp": 1.15314293, + "epoch": 0.17929973066564064, + "flos": 746249158656.0, + "grad_norm": 0.06808586729698768, + "language_loss": 0.89063865, + "learning_rate": 0.0009426834567118214, + "loss": 0.90237117, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.2010498, + "step": 932, + "time_per_iteration": 3.1137044429779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179521, + "balance_loss_mlp": 1.16003084, + "epoch": 0.1794921123509042, + "flos": 713214305280.0, + "grad_norm": 0.07690698304216284, + "language_loss": 0.80337363, + "learning_rate": 0.0009425385369740155, + "loss": 0.81516886, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.19470215, + "step": 933, + "time_per_iteration": 3.0430078506469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186692, + "balance_loss_mlp": 1.16659284, + "epoch": 0.17968449403616776, + "flos": 633142448640.0, + "grad_norm": 0.10248881334159239, + "language_loss": 0.86684513, + "learning_rate": 0.0009423934454275125, + "loss": 0.87871206, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.20092773, + "step": 934, + "time_per_iteration": 2.888127565383911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171583, + "balance_loss_mlp": 1.15185428, + "epoch": 0.17987687572143132, + "flos": 536323368960.0, + "grad_norm": 0.08181978587800019, + "language_loss": 0.91464841, + "learning_rate": 0.0009422481821286418, + "loss": 0.92636418, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.19714355, + "step": 935, + "time_per_iteration": 2.725064516067505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115528, + "balance_loss_mlp": 1.13605165, + "epoch": 0.18006925740669488, + 
"flos": 538077676032.0, + "grad_norm": 0.08977099192722084, + "language_loss": 0.87336344, + "learning_rate": 0.0009421027471337998, + "loss": 0.88491625, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.19213867, + "step": 936, + "time_per_iteration": 2.64992356300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153899, + "balance_loss_mlp": 1.13451552, + "epoch": 0.18026163909195844, + "flos": 539510782464.0, + "grad_norm": 0.08166389785278784, + "language_loss": 0.82045889, + "learning_rate": 0.0009419571404994493, + "loss": 0.83199793, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.19360352, + "step": 937, + "time_per_iteration": 2.6302027702331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140367, + "balance_loss_mlp": 1.12045932, + "epoch": 0.180454020777222, + "flos": 500642187264.0, + "grad_norm": 0.10573813889003272, + "language_loss": 0.9057107, + "learning_rate": 0.00094181136228212, + "loss": 0.91711438, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.19909668, + "step": 938, + "time_per_iteration": 2.6472811698913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146966, + "balance_loss_mlp": 1.12671292, + "epoch": 0.18064640246248556, + "flos": 498952120320.0, + "grad_norm": 0.10223057205117164, + "language_loss": 0.85864574, + "learning_rate": 0.0009416654125384077, + "loss": 0.8701154, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.20251465, + "step": 939, + "time_per_iteration": 2.7523345947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100615, + "balance_loss_mlp": 1.08507037, + "epoch": 0.18083878414774912, + "flos": 1519313988096.0, + "grad_norm": 0.03692949506691956, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80872989, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.15527344, + "step": 940, + "time_per_iteration": 4.95509147644043 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139639, + "balance_loss_mlp": 1.1185863, + "epoch": 0.1810311658330127, + "flos": 727337594880.0, + "grad_norm": 0.07658245982623446, + "language_loss": 0.83579218, + "learning_rate": 0.000941372998698552, + "loss": 0.84718859, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.21057129, + "step": 941, + "time_per_iteration": 3.022993326187134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152659, + "balance_loss_mlp": 1.13134432, + "epoch": 0.18122354751827627, + "flos": 564923082240.0, + "grad_norm": 0.08701506300356623, + "language_loss": 0.81278259, + "learning_rate": 0.0009412265347159336, + "loss": 0.82430923, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.21325684, + "step": 942, + "time_per_iteration": 2.7516462802886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136116, + "balance_loss_mlp": 1.11446774, + "epoch": 0.18141592920353983, + "flos": 519282109440.0, + "grad_norm": 0.09990043941217396, + "language_loss": 0.84286022, + "learning_rate": 0.0009410798994339829, + "loss": 0.85422134, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.21655273, + "step": 943, + "time_per_iteration": 2.619678258895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125702, + "balance_loss_mlp": 1.10438752, + "epoch": 0.1816083108888034, + "flos": 512470084608.0, + "grad_norm": 0.0907244307429491, + "language_loss": 0.87645197, + "learning_rate": 0.000940933092909628, + "loss": 0.88770896, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.21337891, + "step": 944, + "time_per_iteration": 2.5915796756744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137514, + "balance_loss_mlp": 1.11566281, + "epoch": 0.18180069257406695, + "flos": 492389715456.0, + "grad_norm": 0.07468252045243974, + "language_loss": 0.8361553, + "learning_rate": 0.0009407861151998649, + "loss": 0.84753042, + 
"num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.21838379, + "step": 945, + "time_per_iteration": 2.597646713256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146698, + "balance_loss_mlp": 1.12490702, + "epoch": 0.1819930742593305, + "flos": 570158839296.0, + "grad_norm": 0.07893028842648955, + "language_loss": 0.85781825, + "learning_rate": 0.0009406389663617552, + "loss": 0.86928523, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.21789551, + "step": 946, + "time_per_iteration": 2.6909499168395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157426, + "balance_loss_mlp": 1.1367197, + "epoch": 0.18218545594459407, + "flos": 605975841792.0, + "grad_norm": 0.0883302731715351, + "language_loss": 0.85250366, + "learning_rate": 0.000940491646452427, + "loss": 0.86407793, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.20703125, + "step": 947, + "time_per_iteration": 2.7548892498016357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188123, + "balance_loss_mlp": 1.16742826, + "epoch": 0.18237783762985763, + "flos": 548682439680.0, + "grad_norm": 0.09521178511918296, + "language_loss": 0.9039495, + "learning_rate": 0.000940344155529075, + "loss": 0.91583067, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.20690918, + "step": 948, + "time_per_iteration": 2.6882100105285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214568, + "balance_loss_mlp": 1.19396889, + "epoch": 0.1825702193151212, + "flos": 450741542400.0, + "grad_norm": 0.12174210826928723, + "language_loss": 0.86923814, + "learning_rate": 0.0009401964936489605, + "loss": 0.88138384, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.20605469, + "step": 949, + "time_per_iteration": 2.5339841842651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199663, + "balance_loss_mlp": 1.18013692, + "epoch": 0.18276260100038477, + "flos": 589245871104.0, 
+ "grad_norm": 0.0789508013524053, + "language_loss": 0.85218668, + "learning_rate": 0.0009400486608694108, + "loss": 0.86418331, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.19506836, + "step": 950, + "time_per_iteration": 2.7437641620635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173826, + "balance_loss_mlp": 1.15394247, + "epoch": 0.18295498268564833, + "flos": 787331653632.0, + "grad_norm": 0.08777486633127113, + "language_loss": 0.87155032, + "learning_rate": 0.0009399006572478195, + "loss": 0.88328856, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.19873047, + "step": 951, + "time_per_iteration": 3.1146392822265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151812, + "balance_loss_mlp": 1.1324048, + "epoch": 0.1831473643709119, + "flos": 578147010048.0, + "grad_norm": 0.06965363368279433, + "language_loss": 0.90749818, + "learning_rate": 0.0009397524828416468, + "loss": 0.91901636, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.19384766, + "step": 952, + "time_per_iteration": 2.7005960941314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150163, + "balance_loss_mlp": 1.13092208, + "epoch": 0.18333974605617545, + "flos": 566889933312.0, + "grad_norm": 0.08371144384200242, + "language_loss": 0.95721734, + "learning_rate": 0.0009396041377084192, + "loss": 0.96871901, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.19226074, + "step": 953, + "time_per_iteration": 2.65962290763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143055, + "balance_loss_mlp": 1.12399304, + "epoch": 0.183532127741439, + "flos": 526993496064.0, + "grad_norm": 0.07808709569264205, + "language_loss": 0.87208664, + "learning_rate": 0.0009394556219057295, + "loss": 0.88351727, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.19055176, + "step": 954, + "time_per_iteration": 2.7021074295043945 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01146054, + "balance_loss_mlp": 1.12665915, + "epoch": 0.18372450942670257, + "flos": 594535956480.0, + "grad_norm": 0.0732836103686164, + "language_loss": 0.83296251, + "learning_rate": 0.0009393069354912362, + "loss": 0.84442306, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.19372559, + "step": 955, + "time_per_iteration": 2.7472946643829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146804, + "balance_loss_mlp": 1.12801623, + "epoch": 0.18391689111196613, + "flos": 645032014848.0, + "grad_norm": 0.07466806963668332, + "language_loss": 0.81601501, + "learning_rate": 0.0009391580785226649, + "loss": 0.827483, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.18798828, + "step": 956, + "time_per_iteration": 2.865922212600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084193, + "balance_loss_mlp": 1.07007885, + "epoch": 0.18410927279722972, + "flos": 1457073349632.0, + "grad_norm": 0.04640489893855834, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80424643, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.14160156, + "step": 957, + "time_per_iteration": 4.8100152015686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115619, + "balance_loss_mlp": 1.13656831, + "epoch": 0.18430165448249328, + "flos": 658750040064.0, + "grad_norm": 0.08641924144795167, + "language_loss": 0.86033231, + "learning_rate": 0.0009388598531545196, + "loss": 0.87189424, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.19604492, + "step": 958, + "time_per_iteration": 2.879993438720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162855, + "balance_loss_mlp": 1.14316201, + "epoch": 0.18449403616775684, + "flos": 517933066752.0, + "grad_norm": 0.08295253694800603, + "language_loss": 0.85064113, + "learning_rate": 0.000938710484870727, + "loss": 0.8622697, + "num_input_tokens_seen": 
79466688, + "router_z_loss_mlp": 0.19677734, + "step": 959, + "time_per_iteration": 2.6058270931243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169469, + "balance_loss_mlp": 1.14974046, + "epoch": 0.1846864178530204, + "flos": 552749391360.0, + "grad_norm": 0.0909196929102129, + "language_loss": 0.85416096, + "learning_rate": 0.0009385609462644189, + "loss": 0.86585563, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.19714355, + "step": 960, + "time_per_iteration": 4.22582483291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116162, + "balance_loss_mlp": 1.14138985, + "epoch": 0.18487879953828396, + "flos": 466166886912.0, + "grad_norm": 0.0839924836083711, + "language_loss": 0.8550421, + "learning_rate": 0.0009384112373936514, + "loss": 0.86665827, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.20227051, + "step": 961, + "time_per_iteration": 2.6566050052642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161461, + "balance_loss_mlp": 1.14142191, + "epoch": 0.18507118122354752, + "flos": 648496212480.0, + "grad_norm": 0.0943693164808434, + "language_loss": 0.90989888, + "learning_rate": 0.0009382613583165467, + "loss": 0.92151344, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.20031738, + "step": 962, + "time_per_iteration": 2.823707103729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115093, + "balance_loss_mlp": 1.13110566, + "epoch": 0.18526356290881107, + "flos": 626772764160.0, + "grad_norm": 0.07960710886198098, + "language_loss": 0.89083374, + "learning_rate": 0.0009381113090912928, + "loss": 0.90234309, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.19824219, + "step": 963, + "time_per_iteration": 2.760617733001709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113196, + "balance_loss_mlp": 1.11194444, + "epoch": 0.18545594459407463, + "flos": 432726769152.0, + "grad_norm": 
0.09269195293936518, + "language_loss": 0.89102614, + "learning_rate": 0.000937961089776144, + "loss": 0.90234572, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.20007324, + "step": 964, + "time_per_iteration": 2.637064218521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137413, + "balance_loss_mlp": 1.11674166, + "epoch": 0.1856483262793382, + "flos": 749061043200.0, + "grad_norm": 0.09284731320409442, + "language_loss": 0.82889503, + "learning_rate": 0.0009378107004294208, + "loss": 0.84026921, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.20678711, + "step": 965, + "time_per_iteration": 2.9863977432250977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133143, + "balance_loss_mlp": 1.11312819, + "epoch": 0.18584070796460178, + "flos": 530326642176.0, + "grad_norm": 0.08496740626071231, + "language_loss": 0.90790451, + "learning_rate": 0.0009376601411095096, + "loss": 0.91923594, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.20007324, + "step": 966, + "time_per_iteration": 2.68448543548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118077, + "balance_loss_mlp": 1.09840786, + "epoch": 0.18603308964986534, + "flos": 483106830336.0, + "grad_norm": 0.07860547413279617, + "language_loss": 0.8636961, + "learning_rate": 0.0009375094118748622, + "loss": 0.87487686, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.1965332, + "step": 967, + "time_per_iteration": 2.6023223400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116788, + "balance_loss_mlp": 1.09746408, + "epoch": 0.1862254713351289, + "flos": 801316551168.0, + "grad_norm": 0.09121906518800267, + "language_loss": 0.90388292, + "learning_rate": 0.0009373585127839976, + "loss": 0.91505075, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.19299316, + "step": 968, + "time_per_iteration": 2.9992241859436035 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01128327, + "balance_loss_mlp": 1.10974205, + "epoch": 0.18641785302039246, + "flos": 478323325440.0, + "grad_norm": 0.08500834593637788, + "language_loss": 0.90474886, + "learning_rate": 0.0009372074438954994, + "loss": 0.91603214, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.18579102, + "step": 969, + "time_per_iteration": 2.6900458335876465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129744, + "balance_loss_mlp": 1.11119485, + "epoch": 0.18661023470565602, + "flos": 388911684096.0, + "grad_norm": 0.07463307704621708, + "language_loss": 0.91465181, + "learning_rate": 0.0009370562052680181, + "loss": 0.92594928, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.18554688, + "step": 970, + "time_per_iteration": 2.4830586910247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118559, + "balance_loss_mlp": 1.10014117, + "epoch": 0.18680261639091958, + "flos": 564676033536.0, + "grad_norm": 0.0879562727670826, + "language_loss": 0.89281493, + "learning_rate": 0.0009369047969602695, + "loss": 0.90400052, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.18432617, + "step": 971, + "time_per_iteration": 2.745058298110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126638, + "balance_loss_mlp": 1.10707593, + "epoch": 0.18699499807618314, + "flos": 479259763200.0, + "grad_norm": 0.10844584745321367, + "language_loss": 0.862324, + "learning_rate": 0.0009367532190310357, + "loss": 0.87359041, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.19543457, + "step": 972, + "time_per_iteration": 2.6137964725494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113488, + "balance_loss_mlp": 1.09404469, + "epoch": 0.1871873797614467, + "flos": 553283136000.0, + "grad_norm": 0.07658656218276177, + "language_loss": 0.88875228, + "learning_rate": 0.0009366014715391644, + "loss": 0.8998872, + "num_input_tokens_seen": 80453088, 
+ "router_z_loss_mlp": 0.19433594, + "step": 973, + "time_per_iteration": 2.6654906272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112059, + "balance_loss_mlp": 1.09299731, + "epoch": 0.18737976144671029, + "flos": 552811060224.0, + "grad_norm": 0.11180851981284076, + "language_loss": 0.83713347, + "learning_rate": 0.0009364495545435693, + "loss": 0.84825402, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.19055176, + "step": 974, + "time_per_iteration": 2.801388740539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120051, + "balance_loss_mlp": 1.1004051, + "epoch": 0.18757214313197385, + "flos": 502250761728.0, + "grad_norm": 0.06978545014802194, + "language_loss": 0.87871438, + "learning_rate": 0.0009362974681032297, + "loss": 0.88991487, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.19628906, + "step": 975, + "time_per_iteration": 2.6227941513061523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124993, + "balance_loss_mlp": 1.10491848, + "epoch": 0.1877645248172374, + "flos": 675010506240.0, + "grad_norm": 0.08030171004504767, + "language_loss": 0.88050348, + "learning_rate": 0.0009361452122771907, + "loss": 0.89175344, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.20080566, + "step": 976, + "time_per_iteration": 2.899641752243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139865, + "balance_loss_mlp": 1.1185981, + "epoch": 0.18795690650250096, + "flos": 404989341696.0, + "grad_norm": 0.09158450212133555, + "language_loss": 0.82837689, + "learning_rate": 0.0009359927871245635, + "loss": 0.8397755, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.21289062, + "step": 977, + "time_per_iteration": 2.5095362663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147728, + "balance_loss_mlp": 1.12616336, + "epoch": 0.18814928818776452, + "flos": 637891448832.0, + "grad_norm": 
0.08436158367459867, + "language_loss": 0.86086357, + "learning_rate": 0.0009358401927045246, + "loss": 0.8723408, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.21569824, + "step": 978, + "time_per_iteration": 2.880329132080078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115214, + "balance_loss_mlp": 1.12937117, + "epoch": 0.18834166987302808, + "flos": 1138282191360.0, + "grad_norm": 0.14896441210102726, + "language_loss": 0.881185, + "learning_rate": 0.0009356874290763166, + "loss": 0.89270639, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.22753906, + "step": 979, + "time_per_iteration": 3.519901990890503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146434, + "balance_loss_mlp": 1.12485671, + "epoch": 0.18853405155829164, + "flos": 504793202688.0, + "grad_norm": 0.08194638070334626, + "language_loss": 0.88670301, + "learning_rate": 0.0009355344962992474, + "loss": 0.89816737, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.21606445, + "step": 980, + "time_per_iteration": 2.638364553451538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137899, + "balance_loss_mlp": 1.11571455, + "epoch": 0.1887264332435552, + "flos": 608177258496.0, + "grad_norm": 0.07836652437453029, + "language_loss": 0.8762567, + "learning_rate": 0.0009353813944326908, + "loss": 0.88763571, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.22180176, + "step": 981, + "time_per_iteration": 2.963667869567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131587, + "balance_loss_mlp": 1.10924709, + "epoch": 0.1889188149288188, + "flos": 552529506816.0, + "grad_norm": 0.08486883897693408, + "language_loss": 0.82728517, + "learning_rate": 0.0009352281235360863, + "loss": 0.83860105, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.22338867, + "step": 982, + "time_per_iteration": 2.752194404602051 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01146713, + "balance_loss_mlp": 1.12631679, + "epoch": 0.18911119661408235, + "flos": 418559063040.0, + "grad_norm": 0.08390803894001939, + "language_loss": 0.84704804, + "learning_rate": 0.0009350746836689389, + "loss": 0.85851514, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.20385742, + "step": 983, + "time_per_iteration": 2.572817325592041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114981, + "balance_loss_mlp": 1.13550532, + "epoch": 0.1893035782993459, + "flos": 1481974299648.0, + "grad_norm": 0.06256828552174507, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.8258903, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.14257812, + "step": 984, + "time_per_iteration": 5.0779805183410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126626, + "balance_loss_mlp": 1.10678935, + "epoch": 0.18949595998460947, + "flos": 508467373056.0, + "grad_norm": 0.08472556529064418, + "language_loss": 0.82448637, + "learning_rate": 0.0009347672972613634, + "loss": 0.83575261, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.19824219, + "step": 985, + "time_per_iteration": 2.615293502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113053, + "balance_loss_mlp": 1.11202836, + "epoch": 0.18968834166987303, + "flos": 531087611904.0, + "grad_norm": 0.06995806836739982, + "language_loss": 0.8510493, + "learning_rate": 0.0009346133508402735, + "loss": 0.86235464, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.18469238, + "step": 986, + "time_per_iteration": 2.729766845703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145502, + "balance_loss_mlp": 1.12719178, + "epoch": 0.1898807233551366, + "flos": 499762649088.0, + "grad_norm": 0.07783152768123536, + "language_loss": 0.83385336, + "learning_rate": 0.0009344592356873166, + "loss": 0.84530836, + "num_input_tokens_seen": 81637024, + 
"router_z_loss_mlp": 0.18322754, + "step": 987, + "time_per_iteration": 2.642298698425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142552, + "balance_loss_mlp": 1.12420571, + "epoch": 0.19007310504040015, + "flos": 602220178944.0, + "grad_norm": 0.1311760581731783, + "language_loss": 0.78159761, + "learning_rate": 0.0009343049518623255, + "loss": 0.79302317, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.18359375, + "step": 988, + "time_per_iteration": 2.7496607303619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147502, + "balance_loss_mlp": 1.12969208, + "epoch": 0.1902654867256637, + "flos": 601651929600.0, + "grad_norm": 0.07011475213003748, + "language_loss": 0.82941067, + "learning_rate": 0.0009341504994251985, + "loss": 0.8408857, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.17822266, + "step": 989, + "time_per_iteration": 2.850295305252075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154172, + "balance_loss_mlp": 1.13986683, + "epoch": 0.19045786841092727, + "flos": 1575784005120.0, + "grad_norm": 0.061552691423840886, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74674672, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.14257812, + "step": 990, + "time_per_iteration": 5.020269393920898 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160468, + "balance_loss_mlp": 1.14208579, + "epoch": 0.19065025009619085, + "flos": 681634579968.0, + "grad_norm": 0.07610354532645859, + "language_loss": 0.81556082, + "learning_rate": 0.0009338410889544574, + "loss": 0.82716548, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.18383789, + "step": 991, + "time_per_iteration": 3.0640664100646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159788, + "balance_loss_mlp": 1.14151347, + "epoch": 0.1908426317814544, + "flos": 602264595456.0, + "grad_norm": 
0.07533691574431517, + "language_loss": 0.87469906, + "learning_rate": 0.000933686131040967, + "loss": 0.88629693, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.18273926, + "step": 992, + "time_per_iteration": 2.8369646072387695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153048, + "balance_loss_mlp": 1.13516688, + "epoch": 0.19103501346671797, + "flos": 586308077568.0, + "grad_norm": 0.2292689794441624, + "language_loss": 0.90069616, + "learning_rate": 0.0009335310047555883, + "loss": 0.91222656, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.17895508, + "step": 993, + "time_per_iteration": 2.7662436962127686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201823, + "balance_loss_mlp": 1.18303561, + "epoch": 0.19122739515198153, + "flos": 545761898496.0, + "grad_norm": 0.08969446374481721, + "language_loss": 0.87941462, + "learning_rate": 0.0009333757101585467, + "loss": 0.89143288, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.18786621, + "step": 994, + "time_per_iteration": 2.6766159534454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248094, + "balance_loss_mlp": 1.22967577, + "epoch": 0.1914197768372451, + "flos": 521446450176.0, + "grad_norm": 0.09684982281817384, + "language_loss": 0.93064606, + "learning_rate": 0.0009332202473101329, + "loss": 0.94312704, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.1842041, + "step": 995, + "time_per_iteration": 2.6848959922790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124761, + "balance_loss_mlp": 1.22866774, + "epoch": 0.19161215852250865, + "flos": 611246103552.0, + "grad_norm": 0.14945399887744149, + "language_loss": 0.82354605, + "learning_rate": 0.0009330646162707028, + "loss": 0.83602214, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.18933105, + "step": 996, + "time_per_iteration": 2.7672605514526367 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0120105, + "balance_loss_mlp": 1.18239403, + "epoch": 0.1918045402077722, + "flos": 846660916224.0, + "grad_norm": 0.09345568382872575, + "language_loss": 0.83716351, + "learning_rate": 0.0009329088171006779, + "loss": 0.84917402, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.18664551, + "step": 997, + "time_per_iteration": 3.177269697189331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171291, + "balance_loss_mlp": 1.15201521, + "epoch": 0.19199692189303577, + "flos": 465937090560.0, + "grad_norm": 0.09261663839867938, + "language_loss": 0.85307527, + "learning_rate": 0.0009327528498605446, + "loss": 0.86478817, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.19274902, + "step": 998, + "time_per_iteration": 2.5818471908569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136553, + "balance_loss_mlp": 1.11700296, + "epoch": 0.19218930357829936, + "flos": 531576940032.0, + "grad_norm": 0.11232924304021881, + "language_loss": 0.89184988, + "learning_rate": 0.0009325967146108548, + "loss": 0.90321541, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.1953125, + "step": 999, + "time_per_iteration": 2.672342300415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141697, + "balance_loss_mlp": 1.12257588, + "epoch": 0.19238168526356292, + "flos": 601624765440.0, + "grad_norm": 0.11996696196806446, + "language_loss": 0.87541509, + "learning_rate": 0.0009324404114122258, + "loss": 0.88683212, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.19104004, + "step": 1000, + "time_per_iteration": 2.7652101516723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142189, + "balance_loss_mlp": 1.12290096, + "epoch": 0.19257406694882648, + "flos": 571982155776.0, + "grad_norm": 0.09563187877453348, + "language_loss": 0.86816871, + "learning_rate": 0.0009322839403253397, + "loss": 0.87959063, + "num_input_tokens_seen": 
82850032, + "router_z_loss_mlp": 0.19274902, + "step": 1001, + "time_per_iteration": 2.7855865955352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113304, + "balance_loss_mlp": 1.11353719, + "epoch": 0.19276644863409004, + "flos": 801813219840.0, + "grad_norm": 0.0964526780140198, + "language_loss": 0.8374511, + "learning_rate": 0.0009321273014109439, + "loss": 0.84878153, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.19494629, + "step": 1002, + "time_per_iteration": 2.9773457050323486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137462, + "balance_loss_mlp": 1.11835289, + "epoch": 0.1929588303193536, + "flos": 563314507776.0, + "grad_norm": 0.07256968924395192, + "language_loss": 0.8405087, + "learning_rate": 0.0009319704947298513, + "loss": 0.85188329, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.19104004, + "step": 1003, + "time_per_iteration": 2.8997581005096436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144905, + "balance_loss_mlp": 1.12630868, + "epoch": 0.19315121200461716, + "flos": 626837004288.0, + "grad_norm": 0.15770574603346119, + "language_loss": 0.88051564, + "learning_rate": 0.0009318135203429393, + "loss": 0.89196467, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.18579102, + "step": 1004, + "time_per_iteration": 4.269490957260132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156184, + "balance_loss_mlp": 1.13703942, + "epoch": 0.19334359368988072, + "flos": 517451079168.0, + "grad_norm": 0.08756431218137971, + "language_loss": 0.87512451, + "learning_rate": 0.0009316563783111511, + "loss": 0.88668633, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.19128418, + "step": 1005, + "time_per_iteration": 2.741323471069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164816, + "balance_loss_mlp": 1.14583826, + "epoch": 0.19353597537514428, + "flos": 694080285696.0, + "grad_norm": 
0.06803118553980413, + "language_loss": 0.81866097, + "learning_rate": 0.0009314990686954943, + "loss": 0.83030909, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.18969727, + "step": 1006, + "time_per_iteration": 2.955195903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198123, + "balance_loss_mlp": 1.1794908, + "epoch": 0.19372835706040784, + "flos": 1210170585600.0, + "grad_norm": 0.08085614110860996, + "language_loss": 0.80862725, + "learning_rate": 0.000931341591557042, + "loss": 0.8206085, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.18615723, + "step": 1007, + "time_per_iteration": 3.74294114112854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192787, + "balance_loss_mlp": 1.17408264, + "epoch": 0.19392073874567142, + "flos": 520631152128.0, + "grad_norm": 0.10092207476563657, + "language_loss": 0.87274837, + "learning_rate": 0.0009311839469569325, + "loss": 0.88467628, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.18701172, + "step": 1008, + "time_per_iteration": 2.7143359184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188299, + "balance_loss_mlp": 1.16947544, + "epoch": 0.19411312043093498, + "flos": 588816013824.0, + "grad_norm": 0.10252165229611418, + "language_loss": 0.86257041, + "learning_rate": 0.0009310261349563687, + "loss": 0.87445343, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.18823242, + "step": 1009, + "time_per_iteration": 2.7420098781585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156356, + "balance_loss_mlp": 1.13825965, + "epoch": 0.19430550211619854, + "flos": 579382253568.0, + "grad_norm": 0.06920598095285249, + "language_loss": 0.8520751, + "learning_rate": 0.0009308681556166186, + "loss": 0.86363864, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.18103027, + "step": 1010, + "time_per_iteration": 2.8593883514404297 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01162311, + "balance_loss_mlp": 1.14391661, + "epoch": 0.1944978838014621, + "flos": 621126973440.0, + "grad_norm": 0.10589580567356643, + "language_loss": 0.87318867, + "learning_rate": 0.0009307100089990152, + "loss": 0.88481176, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.18408203, + "step": 1011, + "time_per_iteration": 2.7444002628326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144739, + "balance_loss_mlp": 1.12624931, + "epoch": 0.19469026548672566, + "flos": 598714136064.0, + "grad_norm": 0.10287575048528846, + "language_loss": 0.83773112, + "learning_rate": 0.0009305516951649568, + "loss": 0.84917855, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.18481445, + "step": 1012, + "time_per_iteration": 2.7355475425720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174282, + "balance_loss_mlp": 1.15630519, + "epoch": 0.19488264717198922, + "flos": 552161318400.0, + "grad_norm": 0.07063143065951083, + "language_loss": 0.86586678, + "learning_rate": 0.0009303932141759057, + "loss": 0.87760961, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.17980957, + "step": 1013, + "time_per_iteration": 2.778740882873535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166591, + "balance_loss_mlp": 1.14829278, + "epoch": 0.19507502885725278, + "flos": 666135456768.0, + "grad_norm": 0.09801085242945827, + "language_loss": 0.83495271, + "learning_rate": 0.0009302345660933902, + "loss": 0.84661865, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.18286133, + "step": 1014, + "time_per_iteration": 2.8084325790405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178039, + "balance_loss_mlp": 1.1603483, + "epoch": 0.19526741054251634, + "flos": 671081946624.0, + "grad_norm": 0.1010340318018862, + "language_loss": 0.84950441, + "learning_rate": 0.0009300757509790026, + "loss": 0.86128479, + "num_input_tokens_seen": 
83917120, + "router_z_loss_mlp": 0.17712402, + "step": 1015, + "time_per_iteration": 2.9023685455322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179228, + "balance_loss_mlp": 1.16137052, + "epoch": 0.19545979222777993, + "flos": 447215675904.0, + "grad_norm": 0.1305336983537898, + "language_loss": 0.90272522, + "learning_rate": 0.0009299167688944005, + "loss": 0.91451752, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.17883301, + "step": 1016, + "time_per_iteration": 2.5396370887756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180846, + "balance_loss_mlp": 1.16236818, + "epoch": 0.1956521739130435, + "flos": 569084009472.0, + "grad_norm": 0.10642959866559894, + "language_loss": 0.85698497, + "learning_rate": 0.0009297576199013063, + "loss": 0.86879343, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.18457031, + "step": 1017, + "time_per_iteration": 2.7503206729888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151973, + "balance_loss_mlp": 1.13890779, + "epoch": 0.19584455559830705, + "flos": 1455749273088.0, + "grad_norm": 0.05607404145793752, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74154103, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.13085938, + "step": 1018, + "time_per_iteration": 4.931609153747559 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106819, + "balance_loss_mlp": 1.09365869, + "epoch": 0.1960369372835706, + "flos": 1591150252032.0, + "grad_norm": 0.04672191734885249, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80533117, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.13183594, + "step": 1019, + "time_per_iteration": 5.336720705032349 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228797, + "balance_loss_mlp": 1.21011734, + "epoch": 0.19622931896883417, + "flos": 616017125376.0, + "grad_norm": 
0.07997087287444872, + "language_loss": 0.86300683, + "learning_rate": 0.0009292791720892659, + "loss": 0.8752948, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.18664551, + "step": 1020, + "time_per_iteration": 2.8861892223358154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01221818, + "balance_loss_mlp": 1.20275593, + "epoch": 0.19642170065409773, + "flos": 466201391616.0, + "grad_norm": 0.08883950328468299, + "language_loss": 0.88082206, + "learning_rate": 0.0009291193560807218, + "loss": 0.89304024, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.19055176, + "step": 1021, + "time_per_iteration": 2.6382570266723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209623, + "balance_loss_mlp": 1.19078755, + "epoch": 0.19661408233936128, + "flos": 515289309696.0, + "grad_norm": 0.07890952504822618, + "language_loss": 0.86793423, + "learning_rate": 0.0009289593734732688, + "loss": 0.88003045, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.18811035, + "step": 1022, + "time_per_iteration": 2.6261141300201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185276, + "balance_loss_mlp": 1.16670358, + "epoch": 0.19680646402462484, + "flos": 392640182784.0, + "grad_norm": 0.0835325264325779, + "language_loss": 0.93570763, + "learning_rate": 0.0009287992243290175, + "loss": 0.94756043, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.18579102, + "step": 1023, + "time_per_iteration": 2.515672445297241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161758, + "balance_loss_mlp": 1.14213622, + "epoch": 0.19699884570988843, + "flos": 626421828096.0, + "grad_norm": 0.07747777445836627, + "language_loss": 0.9021076, + "learning_rate": 0.0009286389087101435, + "loss": 0.9137252, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.19604492, + "step": 1024, + "time_per_iteration": 2.8165409564971924 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01144138, + "balance_loss_mlp": 1.12458754, + "epoch": 0.197191227395152, + "flos": 557982577152.0, + "grad_norm": 0.093529298896635, + "language_loss": 0.88402045, + "learning_rate": 0.0009284784266788864, + "loss": 0.8954618, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.19543457, + "step": 1025, + "time_per_iteration": 2.746727705001831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143253, + "balance_loss_mlp": 1.12456095, + "epoch": 0.19738360908041555, + "flos": 664993815552.0, + "grad_norm": 0.07377516343298976, + "language_loss": 0.92142463, + "learning_rate": 0.0009283177782975512, + "loss": 0.9328571, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.18688965, + "step": 1026, + "time_per_iteration": 3.0783705711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125598, + "balance_loss_mlp": 1.1064887, + "epoch": 0.1975759907656791, + "flos": 522496687104.0, + "grad_norm": 0.09283572483169282, + "language_loss": 0.87607288, + "learning_rate": 0.000928156963628507, + "loss": 0.8873288, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.19116211, + "step": 1027, + "time_per_iteration": 2.6074790954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119339, + "balance_loss_mlp": 1.09947884, + "epoch": 0.19776837245094267, + "flos": 462482804736.0, + "grad_norm": 0.17318139898935403, + "language_loss": 0.87847698, + "learning_rate": 0.0009279959827341877, + "loss": 0.88967031, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.1986084, + "step": 1028, + "time_per_iteration": 2.786883592605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122475, + "balance_loss_mlp": 1.10186362, + "epoch": 0.19796075413620623, + "flos": 503058719232.0, + "grad_norm": 0.09725837933244906, + "language_loss": 0.87463772, + "learning_rate": 0.0009278348356770915, + "loss": 0.88586247, + "num_input_tokens_seen": 85241632, + 
"router_z_loss_mlp": 0.20617676, + "step": 1029, + "time_per_iteration": 2.6152124404907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115903, + "balance_loss_mlp": 1.09576869, + "epoch": 0.1981531358214698, + "flos": 507538275840.0, + "grad_norm": 0.09726879406227856, + "language_loss": 0.85104239, + "learning_rate": 0.0009276735225197814, + "loss": 0.86220145, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.20129395, + "step": 1030, + "time_per_iteration": 2.6491973400115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140863, + "balance_loss_mlp": 1.12079978, + "epoch": 0.19834551750673335, + "flos": 531547204608.0, + "grad_norm": 0.07981294302307375, + "language_loss": 0.85465813, + "learning_rate": 0.0009275120433248847, + "loss": 0.86606669, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.20056152, + "step": 1031, + "time_per_iteration": 2.7181904315948486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170044, + "balance_loss_mlp": 1.14986157, + "epoch": 0.1985378991919969, + "flos": 775511096832.0, + "grad_norm": 0.08870117223998657, + "language_loss": 0.85574758, + "learning_rate": 0.0009273503981550931, + "loss": 0.86744803, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.20178223, + "step": 1032, + "time_per_iteration": 3.15751576423645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01210574, + "balance_loss_mlp": 1.19066548, + "epoch": 0.1987302808772605, + "flos": 434288355840.0, + "grad_norm": 0.10622365116136065, + "language_loss": 0.86958814, + "learning_rate": 0.0009271885870731626, + "loss": 0.88169384, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.19909668, + "step": 1033, + "time_per_iteration": 2.513871431350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0124371, + "balance_loss_mlp": 1.22355127, + "epoch": 0.19892266256252406, + "flos": 553604336640.0, + "grad_norm": 
0.12163862472720371, + "language_loss": 0.88120484, + "learning_rate": 0.0009270266101419143, + "loss": 0.89364195, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.20153809, + "step": 1034, + "time_per_iteration": 2.6154308319091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233971, + "balance_loss_mlp": 1.21453989, + "epoch": 0.19911504424778761, + "flos": 549865926144.0, + "grad_norm": 0.13626001105869123, + "language_loss": 0.84950191, + "learning_rate": 0.0009268644674242328, + "loss": 0.86184162, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.19433594, + "step": 1035, + "time_per_iteration": 2.706982135772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220957, + "balance_loss_mlp": 1.20152593, + "epoch": 0.19930742593305117, + "flos": 518281431552.0, + "grad_norm": 0.09310216058180905, + "language_loss": 0.80796313, + "learning_rate": 0.0009267021589830678, + "loss": 0.82017273, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.19421387, + "step": 1036, + "time_per_iteration": 2.641144275665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300787, + "balance_loss_mlp": 1.28457427, + "epoch": 0.19949980761831473, + "flos": 1509338769408.0, + "grad_norm": 0.08257719551105532, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78927869, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.16210938, + "step": 1037, + "time_per_iteration": 5.017476558685303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198691, + "balance_loss_mlp": 1.17903364, + "epoch": 0.1996921893035783, + "flos": 698129985024.0, + "grad_norm": 0.08600893320147879, + "language_loss": 0.92715919, + "learning_rate": 0.000926377045182406, + "loss": 0.93914616, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.19641113, + "step": 1038, + "time_per_iteration": 2.939668893814087 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01215159, + "balance_loss_mlp": 1.19595408, + "epoch": 0.19988457098884185, + "flos": 727023734784.0, + "grad_norm": 0.24386100452943713, + "language_loss": 0.87511599, + "learning_rate": 0.0009262142399491296, + "loss": 0.88726759, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.19189453, + "step": 1039, + "time_per_iteration": 3.0862977504730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248948, + "balance_loss_mlp": 1.22932601, + "epoch": 0.2000769526741054, + "flos": 560544841728.0, + "grad_norm": 0.09408226392225982, + "language_loss": 0.87996912, + "learning_rate": 0.0009260512692448105, + "loss": 0.89245868, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.19604492, + "step": 1040, + "time_per_iteration": 2.711160182952881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01288003, + "balance_loss_mlp": 1.26749945, + "epoch": 0.200269334359369, + "flos": 572039055360.0, + "grad_norm": 0.13301921079143278, + "language_loss": 0.84115559, + "learning_rate": 0.000925888133132719, + "loss": 0.85403562, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.20507812, + "step": 1041, + "time_per_iteration": 2.740140199661255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166251, + "balance_loss_mlp": 1.1515646, + "epoch": 0.20046171604463256, + "flos": 1486118347776.0, + "grad_norm": 0.059408002972858115, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.8077668, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.14648438, + "step": 1042, + "time_per_iteration": 4.983680009841919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01318672, + "balance_loss_mlp": 1.29690433, + "epoch": 0.20065409772989612, + "flos": 496528247808.0, + "grad_norm": 0.1163225797864763, + "language_loss": 0.81054026, + "learning_rate": 0.0009255613649386244, + "loss": 0.82372701, + "num_input_tokens_seen": 
86587296, + "router_z_loss_mlp": 0.21777344, + "step": 1043, + "time_per_iteration": 2.6790683269500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300915, + "balance_loss_mlp": 1.27936232, + "epoch": 0.20084647941515968, + "flos": 579367572480.0, + "grad_norm": 0.10848871275509671, + "language_loss": 0.78969169, + "learning_rate": 0.0009253977329834838, + "loss": 0.80270082, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.21569824, + "step": 1044, + "time_per_iteration": 2.6970701217651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286746, + "balance_loss_mlp": 1.26458514, + "epoch": 0.20103886110042324, + "flos": 642076968960.0, + "grad_norm": 0.09565462118383694, + "language_loss": 0.86161876, + "learning_rate": 0.0009252339358742965, + "loss": 0.87448621, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.22167969, + "step": 1045, + "time_per_iteration": 2.87453556060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129502, + "balance_loss_mlp": 1.2733593, + "epoch": 0.2012312427856868, + "flos": 441970007040.0, + "grad_norm": 0.10796199739740596, + "language_loss": 0.83195245, + "learning_rate": 0.000925069973674654, + "loss": 0.84490263, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.21679688, + "step": 1046, + "time_per_iteration": 2.6612823009490967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275399, + "balance_loss_mlp": 1.25408411, + "epoch": 0.20142362447095036, + "flos": 554402382336.0, + "grad_norm": 0.06722367899146847, + "language_loss": 0.88250053, + "learning_rate": 0.000924905846448212, + "loss": 0.89525455, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.21325684, + "step": 1047, + "time_per_iteration": 2.730875015258789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01292917, + "balance_loss_mlp": 1.27123272, + "epoch": 0.20161600615621392, + "flos": 670301153280.0, + "grad_norm": 
0.09038052031526789, + "language_loss": 0.85797572, + "learning_rate": 0.0009247415542586906, + "loss": 0.87090492, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.21691895, + "step": 1048, + "time_per_iteration": 2.8412506580352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248583, + "balance_loss_mlp": 1.22672033, + "epoch": 0.2018083878414775, + "flos": 573091490304.0, + "grad_norm": 0.08064336148566398, + "language_loss": 0.83021247, + "learning_rate": 0.0009245770971698735, + "loss": 0.84269828, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.21875, + "step": 1049, + "time_per_iteration": 4.440186023712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01237632, + "balance_loss_mlp": 1.21671033, + "epoch": 0.20200076952674106, + "flos": 425857844736.0, + "grad_norm": 0.08794152426297831, + "language_loss": 0.88490599, + "learning_rate": 0.0009244124752456087, + "loss": 0.89728236, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.20922852, + "step": 1050, + "time_per_iteration": 2.529827833175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01224991, + "balance_loss_mlp": 1.20434391, + "epoch": 0.20219315121200462, + "flos": 536597581824.0, + "grad_norm": 0.07833105787078826, + "language_loss": 0.85121548, + "learning_rate": 0.0009242476885498081, + "loss": 0.86346543, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.20654297, + "step": 1051, + "time_per_iteration": 2.7487235069274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201681, + "balance_loss_mlp": 1.18077159, + "epoch": 0.20238553289726818, + "flos": 477873644544.0, + "grad_norm": 0.09537947845979083, + "language_loss": 0.80832058, + "learning_rate": 0.0009240827371464474, + "loss": 0.82033736, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.20922852, + "step": 1052, + "time_per_iteration": 2.570289373397827 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01190217, + "balance_loss_mlp": 1.16978419, + "epoch": 0.20257791458253174, + "flos": 1152057116160.0, + "grad_norm": 0.0749559041873476, + "language_loss": 0.83869404, + "learning_rate": 0.0009239176210995666, + "loss": 0.85059625, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.2043457, + "step": 1053, + "time_per_iteration": 3.48331880569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164732, + "balance_loss_mlp": 1.14463329, + "epoch": 0.2027702962677953, + "flos": 666913678848.0, + "grad_norm": 0.08759256892165929, + "language_loss": 0.9366219, + "learning_rate": 0.0009237523404732695, + "loss": 0.94826925, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.2010498, + "step": 1054, + "time_per_iteration": 2.8900768756866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152229, + "balance_loss_mlp": 1.13102162, + "epoch": 0.20296267795305886, + "flos": 641298746880.0, + "grad_norm": 0.08554891996887364, + "language_loss": 0.84106672, + "learning_rate": 0.0009235868953317235, + "loss": 0.85258889, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.21191406, + "step": 1055, + "time_per_iteration": 2.805739402770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152429, + "balance_loss_mlp": 1.1321516, + "epoch": 0.20315505963832242, + "flos": 930575070720.0, + "grad_norm": 0.08283401132516657, + "language_loss": 0.84830916, + "learning_rate": 0.0009234212857391602, + "loss": 0.85983348, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.20275879, + "step": 1056, + "time_per_iteration": 3.2523794174194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150661, + "balance_loss_mlp": 1.13000214, + "epoch": 0.20334744132358598, + "flos": 562111197696.0, + "grad_norm": 0.08956025084292601, + "language_loss": 0.88911903, + "learning_rate": 0.000923255511759875, + "loss": 0.90062559, + "num_input_tokens_seen": 
87651968, + "router_z_loss_mlp": 0.20666504, + "step": 1057, + "time_per_iteration": 2.7904763221740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144001, + "balance_loss_mlp": 1.12379456, + "epoch": 0.20353982300884957, + "flos": 644206804992.0, + "grad_norm": 0.0943960049444156, + "language_loss": 0.84853089, + "learning_rate": 0.000923089573458227, + "loss": 0.85997093, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.2019043, + "step": 1058, + "time_per_iteration": 2.8817007541656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152473, + "balance_loss_mlp": 1.13152814, + "epoch": 0.20373220469411313, + "flos": 651421522944.0, + "grad_norm": 0.0957717786757319, + "language_loss": 0.83558518, + "learning_rate": 0.0009229234708986392, + "loss": 0.84710991, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.20947266, + "step": 1059, + "time_per_iteration": 2.9059059619903564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179467, + "balance_loss_mlp": 1.1632545, + "epoch": 0.2039245863793767, + "flos": 1437628787712.0, + "grad_norm": 0.05660436116329576, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.82846367, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.16210938, + "step": 1060, + "time_per_iteration": 4.709235429763794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158087, + "balance_loss_mlp": 1.13642621, + "epoch": 0.20411696806464025, + "flos": 596967169536.0, + "grad_norm": 0.07273861691254356, + "language_loss": 0.84919071, + "learning_rate": 0.0009225907732636548, + "loss": 0.86077166, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.2166748, + "step": 1061, + "time_per_iteration": 2.7832870483398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170458, + "balance_loss_mlp": 1.14922678, + "epoch": 0.2043093497499038, + "flos": 573803274240.0, + "grad_norm": 
0.10826308082162117, + "language_loss": 0.86149454, + "learning_rate": 0.0009224241783174227, + "loss": 0.87319911, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.21252441, + "step": 1062, + "time_per_iteration": 2.7493624687194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116711, + "balance_loss_mlp": 1.14574718, + "epoch": 0.20450173143516737, + "flos": 630352958976.0, + "grad_norm": 0.0807963285895634, + "language_loss": 0.85689318, + "learning_rate": 0.0009222574193715802, + "loss": 0.86856437, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.21374512, + "step": 1063, + "time_per_iteration": 2.8018240928649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159842, + "balance_loss_mlp": 1.13889694, + "epoch": 0.20469411312043093, + "flos": 574003335168.0, + "grad_norm": 0.07340119955686962, + "language_loss": 0.85735941, + "learning_rate": 0.000922090496490869, + "loss": 0.86895782, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.20947266, + "step": 1064, + "time_per_iteration": 2.765749931335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152698, + "balance_loss_mlp": 1.13164544, + "epoch": 0.20488649480569449, + "flos": 637053755904.0, + "grad_norm": 0.07242145518951734, + "language_loss": 0.89867234, + "learning_rate": 0.0009219234097400937, + "loss": 0.9101994, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.21057129, + "step": 1065, + "time_per_iteration": 2.8627817630767822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114786, + "balance_loss_mlp": 1.12674773, + "epoch": 0.20507887649095807, + "flos": 975793526784.0, + "grad_norm": 0.08464925787423999, + "language_loss": 0.83060288, + "learning_rate": 0.0009217561591841237, + "loss": 0.84208149, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.21130371, + "step": 1066, + "time_per_iteration": 3.3423283100128174 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01142136, + "balance_loss_mlp": 1.12129867, + "epoch": 0.20527125817622163, + "flos": 486183015936.0, + "grad_norm": 0.08558035413219019, + "language_loss": 0.80671912, + "learning_rate": 0.0009215887448878913, + "loss": 0.81814051, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.20849609, + "step": 1067, + "time_per_iteration": 2.5908420085906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133301, + "balance_loss_mlp": 1.11204648, + "epoch": 0.2054636398614852, + "flos": 527178875904.0, + "grad_norm": 0.08226430294551884, + "language_loss": 0.8469618, + "learning_rate": 0.0009214211669163922, + "loss": 0.85829484, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.21264648, + "step": 1068, + "time_per_iteration": 2.70798397064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136986, + "balance_loss_mlp": 1.11625564, + "epoch": 0.20565602154674875, + "flos": 558182638080.0, + "grad_norm": 0.08433693913464968, + "language_loss": 0.9379245, + "learning_rate": 0.0009212534253346862, + "loss": 0.94929433, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.20727539, + "step": 1069, + "time_per_iteration": 2.7776713371276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129003, + "balance_loss_mlp": 1.10772455, + "epoch": 0.2058484032320123, + "flos": 504224953344.0, + "grad_norm": 0.09450930819857521, + "language_loss": 0.8384515, + "learning_rate": 0.0009210855202078964, + "loss": 0.84974158, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.2130127, + "step": 1070, + "time_per_iteration": 2.6283328533172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130904, + "balance_loss_mlp": 1.11017382, + "epoch": 0.20604078491727587, + "flos": 433169109504.0, + "grad_norm": 0.08132695111234396, + "language_loss": 0.86854172, + "learning_rate": 0.0009209174516012091, + "loss": 0.87985075, + "num_input_tokens_seen": 
88820928, + "router_z_loss_mlp": 0.20751953, + "step": 1071, + "time_per_iteration": 2.535447120666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133709, + "balance_loss_mlp": 1.11270416, + "epoch": 0.20623316660253943, + "flos": 608711003136.0, + "grad_norm": 0.11111326067788187, + "language_loss": 0.88662505, + "learning_rate": 0.0009207492195798747, + "loss": 0.89796209, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.21008301, + "step": 1072, + "time_per_iteration": 2.7883682250976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144236, + "balance_loss_mlp": 1.12275457, + "epoch": 0.206425548287803, + "flos": 480425997312.0, + "grad_norm": 0.10819626667436329, + "language_loss": 0.84654653, + "learning_rate": 0.0009205808242092061, + "loss": 0.85798889, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.21484375, + "step": 1073, + "time_per_iteration": 2.6761436462402344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166479, + "balance_loss_mlp": 1.1445806, + "epoch": 0.20661792997306658, + "flos": 949429734912.0, + "grad_norm": 0.10070475961417262, + "language_loss": 0.82806575, + "learning_rate": 0.0009204122655545808, + "loss": 0.8397305, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.21911621, + "step": 1074, + "time_per_iteration": 3.326646089553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169219, + "balance_loss_mlp": 1.14714098, + "epoch": 0.20681031165833014, + "flos": 603487729152.0, + "grad_norm": 0.07526963641038939, + "language_loss": 0.80370897, + "learning_rate": 0.0009202435436814388, + "loss": 0.8154012, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.22070312, + "step": 1075, + "time_per_iteration": 2.718374013900757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117617, + "balance_loss_mlp": 1.15484309, + "epoch": 0.2070026933435937, + "flos": 708984368640.0, + "grad_norm": 
0.08141199692544657, + "language_loss": 0.89125872, + "learning_rate": 0.0009200746586552836, + "loss": 0.90302044, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.21350098, + "step": 1076, + "time_per_iteration": 2.9237890243530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116406, + "balance_loss_mlp": 1.14320993, + "epoch": 0.20719507502885726, + "flos": 829814948352.0, + "grad_norm": 0.08915437819246362, + "language_loss": 0.83578765, + "learning_rate": 0.0009199056105416825, + "loss": 0.8474282, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.20861816, + "step": 1077, + "time_per_iteration": 3.1017873287200928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174288, + "balance_loss_mlp": 1.15383148, + "epoch": 0.20738745671412082, + "flos": 638294141952.0, + "grad_norm": 0.08235458210831342, + "language_loss": 0.8621031, + "learning_rate": 0.0009197363994062654, + "loss": 0.87384599, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.20458984, + "step": 1078, + "time_per_iteration": 2.832416296005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115862, + "balance_loss_mlp": 1.13828301, + "epoch": 0.20757983839938438, + "flos": 685602786816.0, + "grad_norm": 0.14524602294533026, + "language_loss": 0.8378703, + "learning_rate": 0.0009195670253147262, + "loss": 0.84945655, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.20336914, + "step": 1079, + "time_per_iteration": 2.9912445545196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130057, + "balance_loss_mlp": 1.11056602, + "epoch": 0.20777222008464794, + "flos": 519282109440.0, + "grad_norm": 0.07398728313760368, + "language_loss": 0.81629539, + "learning_rate": 0.0009193974883328216, + "loss": 0.82759595, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.19470215, + "step": 1080, + "time_per_iteration": 2.636516809463501 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01142545, + "balance_loss_mlp": 1.12286365, + "epoch": 0.2079646017699115, + "flos": 511402595328.0, + "grad_norm": 0.08145379169955597, + "language_loss": 0.86828917, + "learning_rate": 0.0009192277885263718, + "loss": 0.87971467, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.19665527, + "step": 1081, + "time_per_iteration": 2.7361197471618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137254, + "balance_loss_mlp": 1.11765575, + "epoch": 0.20815698345517505, + "flos": 931820226048.0, + "grad_norm": 0.09498097190043973, + "language_loss": 0.85732365, + "learning_rate": 0.0009190579259612602, + "loss": 0.86869615, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.19580078, + "step": 1082, + "time_per_iteration": 3.3791959285736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156614, + "balance_loss_mlp": 1.13621759, + "epoch": 0.20834936514043864, + "flos": 632401302528.0, + "grad_norm": 0.1488703614850634, + "language_loss": 0.86399055, + "learning_rate": 0.000918887900703433, + "loss": 0.87555665, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.20397949, + "step": 1083, + "time_per_iteration": 2.8133795261383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148544, + "balance_loss_mlp": 1.129125, + "epoch": 0.2085417468257022, + "flos": 394384578048.0, + "grad_norm": 0.0859641513447352, + "language_loss": 0.90200919, + "learning_rate": 0.0009187177128188999, + "loss": 0.91349459, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.19396973, + "step": 1084, + "time_per_iteration": 2.4999842643737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01286106, + "balance_loss_mlp": 1.27151525, + "epoch": 0.20873412851096576, + "flos": 1402147293696.0, + "grad_norm": 0.08105811039849961, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78442645, + "num_input_tokens_seen": 
90038432, + "router_z_loss_mlp": 0.14550781, + "step": 1085, + "time_per_iteration": 4.8958563804626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153669, + "balance_loss_mlp": 1.13441706, + "epoch": 0.20892651019622932, + "flos": 447830913024.0, + "grad_norm": 0.08197687066157772, + "language_loss": 0.85811758, + "learning_rate": 0.000918376849434071, + "loss": 0.86965424, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.19250488, + "step": 1086, + "time_per_iteration": 2.5344736576080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118214, + "balance_loss_mlp": 1.16158867, + "epoch": 0.20911889188149288, + "flos": 493106268672.0, + "grad_norm": 0.10825532619194118, + "language_loss": 0.90649915, + "learning_rate": 0.0009182061740661098, + "loss": 0.9183206, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.20556641, + "step": 1087, + "time_per_iteration": 2.5707151889801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178442, + "balance_loss_mlp": 1.15811718, + "epoch": 0.20931127356675644, + "flos": 841291909632.0, + "grad_norm": 0.08160475290131898, + "language_loss": 0.84683895, + "learning_rate": 0.0009180353363361127, + "loss": 0.85862345, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.203125, + "step": 1088, + "time_per_iteration": 3.137329339981079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174451, + "balance_loss_mlp": 1.15374422, + "epoch": 0.20950365525202, + "flos": 757140618240.0, + "grad_norm": 0.10140667942926032, + "language_loss": 0.81920874, + "learning_rate": 0.0009178643363104044, + "loss": 0.83095324, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.20715332, + "step": 1089, + "time_per_iteration": 3.1493358612060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147137, + "balance_loss_mlp": 1.12660897, + "epoch": 0.20969603693728356, + "flos": 472539142656.0, + "grad_norm": 
0.10442412310556573, + "language_loss": 0.90355861, + "learning_rate": 0.0009176931740553735, + "loss": 0.91503, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.20532227, + "step": 1090, + "time_per_iteration": 2.5557990074157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139677, + "balance_loss_mlp": 1.11933959, + "epoch": 0.20988841862254715, + "flos": 976930025472.0, + "grad_norm": 0.17656839042402708, + "language_loss": 0.82232946, + "learning_rate": 0.0009175218496374708, + "loss": 0.83372623, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.20349121, + "step": 1091, + "time_per_iteration": 3.3492214679718018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132775, + "balance_loss_mlp": 1.11287904, + "epoch": 0.2100808003078107, + "flos": 1093120634880.0, + "grad_norm": 0.09269359078641065, + "language_loss": 0.85681468, + "learning_rate": 0.0009173503631232103, + "loss": 0.86814249, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.19885254, + "step": 1092, + "time_per_iteration": 3.396247386932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131208, + "balance_loss_mlp": 1.11091864, + "epoch": 0.21027318199307427, + "flos": 1012964714496.0, + "grad_norm": 0.09283462310009857, + "language_loss": 0.81684232, + "learning_rate": 0.0009171787145791691, + "loss": 0.82815444, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.20288086, + "step": 1093, + "time_per_iteration": 3.2441000938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132702, + "balance_loss_mlp": 1.11279404, + "epoch": 0.21046556367833782, + "flos": 521394693120.0, + "grad_norm": 0.14183927725725606, + "language_loss": 0.79456544, + "learning_rate": 0.000917006904071987, + "loss": 0.80589247, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.19897461, + "step": 1094, + "time_per_iteration": 2.658992052078247 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01140578, + "balance_loss_mlp": 1.12040734, + "epoch": 0.21065794536360138, + "flos": 603717525504.0, + "grad_norm": 0.07963562881698232, + "language_loss": 0.86590552, + "learning_rate": 0.0009168349316683669, + "loss": 0.87731135, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.20166016, + "step": 1095, + "time_per_iteration": 2.7208545207977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157939, + "balance_loss_mlp": 1.1382103, + "epoch": 0.21085032704886494, + "flos": 603346765824.0, + "grad_norm": 0.06948100196361624, + "language_loss": 0.82885933, + "learning_rate": 0.0009166627974350741, + "loss": 0.84043866, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.19714355, + "step": 1096, + "time_per_iteration": 2.879690647125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158751, + "balance_loss_mlp": 1.13850892, + "epoch": 0.2110427087341285, + "flos": 637671564288.0, + "grad_norm": 0.07894738519235364, + "language_loss": 0.89620626, + "learning_rate": 0.0009164905014389373, + "loss": 0.90779376, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.20239258, + "step": 1097, + "time_per_iteration": 2.7915890216827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174722, + "balance_loss_mlp": 1.15442061, + "epoch": 0.21123509041939206, + "flos": 522919203840.0, + "grad_norm": 0.08089010798718275, + "language_loss": 0.86655492, + "learning_rate": 0.0009163180437468476, + "loss": 0.87830216, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.20300293, + "step": 1098, + "time_per_iteration": 2.671910285949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160878, + "balance_loss_mlp": 1.14083886, + "epoch": 0.21142747210465565, + "flos": 451188652032.0, + "grad_norm": 0.1273171739233691, + "language_loss": 0.85848475, + "learning_rate": 0.000916145424425759, + "loss": 0.87009346, + "num_input_tokens_seen": 
91086752, + "router_z_loss_mlp": 0.20031738, + "step": 1099, + "time_per_iteration": 2.718719959259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138682, + "balance_loss_mlp": 1.11927521, + "epoch": 0.2116198537899192, + "flos": 876175045632.0, + "grad_norm": 0.11827895321179892, + "language_loss": 0.90551817, + "learning_rate": 0.0009159726435426885, + "loss": 0.91690505, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.19384766, + "step": 1100, + "time_per_iteration": 4.622005939483643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096537, + "balance_loss_mlp": 1.07577038, + "epoch": 0.21181223547518277, + "flos": 523662921216.0, + "grad_norm": 0.08009025902543959, + "language_loss": 0.90283167, + "learning_rate": 0.0009157997011647154, + "loss": 0.91379714, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.2076416, + "step": 1101, + "time_per_iteration": 2.605741262435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082643, + "balance_loss_mlp": 1.0622586, + "epoch": 0.21200461716044633, + "flos": 572296015872.0, + "grad_norm": 0.10006580652166666, + "language_loss": 0.85976642, + "learning_rate": 0.0009156265973589817, + "loss": 0.87059283, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.20385742, + "step": 1102, + "time_per_iteration": 2.7997629642486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082906, + "balance_loss_mlp": 1.06256843, + "epoch": 0.2121969988457099, + "flos": 545129409024.0, + "grad_norm": 0.08882618780300273, + "language_loss": 0.89710194, + "learning_rate": 0.0009154533321926926, + "loss": 0.90793097, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.20336914, + "step": 1103, + "time_per_iteration": 2.6505167484283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082845, + "balance_loss_mlp": 1.06240106, + "epoch": 0.21238938053097345, + "flos": 843861514752.0, + "grad_norm": 
0.08104008133152642, + "language_loss": 0.87105876, + "learning_rate": 0.0009152799057331156, + "loss": 0.88188726, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.20446777, + "step": 1104, + "time_per_iteration": 3.16381573677063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085023, + "balance_loss_mlp": 1.06503153, + "epoch": 0.212581762216237, + "flos": 446214998016.0, + "grad_norm": 0.1303184369793021, + "language_loss": 0.90978825, + "learning_rate": 0.0009151063180475805, + "loss": 0.92063844, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.1998291, + "step": 1105, + "time_per_iteration": 2.519392490386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081303, + "balance_loss_mlp": 1.06139469, + "epoch": 0.21277414390150057, + "flos": 514380036096.0, + "grad_norm": 0.09253503988008102, + "language_loss": 0.84230483, + "learning_rate": 0.0009149325692034803, + "loss": 0.85311788, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.19897461, + "step": 1106, + "time_per_iteration": 2.623030662536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122847, + "balance_loss_mlp": 1.11054456, + "epoch": 0.21296652558676413, + "flos": 1485532846080.0, + "grad_norm": 0.03239256029122438, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80326271, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.12304688, + "step": 1107, + "time_per_iteration": 4.865934610366821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095405, + "balance_loss_mlp": 1.07612848, + "epoch": 0.21315890727202771, + "flos": 846040909824.0, + "grad_norm": 0.08663251382833077, + "language_loss": 0.87545854, + "learning_rate": 0.0009145845883094678, + "loss": 0.88641262, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.19262695, + "step": 1108, + "time_per_iteration": 3.0644633769989014 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01106513, + "balance_loss_mlp": 1.08767843, + "epoch": 0.21335128895729127, + "flos": 629379445248.0, + "grad_norm": 0.09154471330204571, + "language_loss": 0.84864843, + "learning_rate": 0.000914410356394654, + "loss": 0.85971349, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.18798828, + "step": 1109, + "time_per_iteration": 2.7818005084991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111664, + "balance_loss_mlp": 1.09850883, + "epoch": 0.21354367064255483, + "flos": 710975812608.0, + "grad_norm": 0.05901208331379503, + "language_loss": 0.84397328, + "learning_rate": 0.0009142359635914709, + "loss": 0.85513967, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.18151855, + "step": 1110, + "time_per_iteration": 3.0699398517608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132455, + "balance_loss_mlp": 1.11437058, + "epoch": 0.2137360523278184, + "flos": 456201953280.0, + "grad_norm": 0.07045633933043649, + "language_loss": 0.84396905, + "learning_rate": 0.0009140614099676245, + "loss": 0.85529351, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.18103027, + "step": 1111, + "time_per_iteration": 2.6896469593048096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144685, + "balance_loss_mlp": 1.12654102, + "epoch": 0.21392843401308195, + "flos": 666051393024.0, + "grad_norm": 0.07609754946919366, + "language_loss": 0.82333195, + "learning_rate": 0.0009138866955908821, + "loss": 0.83477879, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.18151855, + "step": 1112, + "time_per_iteration": 2.9167656898498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173372, + "balance_loss_mlp": 1.15541935, + "epoch": 0.2141208156983455, + "flos": 748996803072.0, + "grad_norm": 0.07536024812721688, + "language_loss": 0.80650687, + "learning_rate": 0.0009137118205290738, + "loss": 0.81824064, + "num_input_tokens_seen": 
92279248, + "router_z_loss_mlp": 0.17956543, + "step": 1113, + "time_per_iteration": 3.038858652114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173379, + "balance_loss_mlp": 1.15471053, + "epoch": 0.21431319738360907, + "flos": 419119971840.0, + "grad_norm": 0.08578166607433227, + "language_loss": 0.9008798, + "learning_rate": 0.0009135367848500924, + "loss": 0.91261363, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.18652344, + "step": 1114, + "time_per_iteration": 2.5301332473754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183524, + "balance_loss_mlp": 1.16561842, + "epoch": 0.21450557906887263, + "flos": 609126179328.0, + "grad_norm": 0.097679735811004, + "language_loss": 0.86396897, + "learning_rate": 0.0009133615886218927, + "loss": 0.87580419, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.17932129, + "step": 1115, + "time_per_iteration": 2.7787675857543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181259, + "balance_loss_mlp": 1.16279316, + "epoch": 0.21469796075413622, + "flos": 561913708032.0, + "grad_norm": 0.08896664083513224, + "language_loss": 0.87571919, + "learning_rate": 0.0009131862319124917, + "loss": 0.88753176, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.18469238, + "step": 1116, + "time_per_iteration": 2.7031164169311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177922, + "balance_loss_mlp": 1.15970659, + "epoch": 0.21489034243939978, + "flos": 594637272576.0, + "grad_norm": 0.07771910148821705, + "language_loss": 0.8379603, + "learning_rate": 0.0009130107147899691, + "loss": 0.84973955, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.18237305, + "step": 1117, + "time_per_iteration": 2.7842912673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180049, + "balance_loss_mlp": 1.16186976, + "epoch": 0.21508272412466334, + "flos": 441898426368.0, + "grad_norm": 
0.07252648730513606, + "language_loss": 0.85351467, + "learning_rate": 0.0009128350373224665, + "loss": 0.86531514, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.1817627, + "step": 1118, + "time_per_iteration": 2.547067880630493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174302, + "balance_loss_mlp": 1.1582799, + "epoch": 0.2152751058099269, + "flos": 1496162202624.0, + "grad_norm": 0.06807222888709992, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.8263073, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.16015625, + "step": 1119, + "time_per_iteration": 4.686914443969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191791, + "balance_loss_mlp": 1.1730994, + "epoch": 0.21546748749519046, + "flos": 494005630464.0, + "grad_norm": 0.07584418562153701, + "language_loss": 0.85298818, + "learning_rate": 0.0009124832016254005, + "loss": 0.86490607, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.18676758, + "step": 1120, + "time_per_iteration": 2.594407558441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179476, + "balance_loss_mlp": 1.16062903, + "epoch": 0.21565986918045402, + "flos": 634531138560.0, + "grad_norm": 0.07950209413805702, + "language_loss": 0.87972558, + "learning_rate": 0.0009123070435324316, + "loss": 0.89152032, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.18835449, + "step": 1121, + "time_per_iteration": 2.8215177059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068291, + "balance_loss_mlp": 1.05379486, + "epoch": 0.21585225086571758, + "flos": 1583359570944.0, + "grad_norm": 0.028005803680130233, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78944069, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.14453125, + "step": 1122, + "time_per_iteration": 5.0041632652282715 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01159249, + "balance_loss_mlp": 1.14079511, + "epoch": 0.21604463255098114, + "flos": 684103242240.0, + "grad_norm": 0.08251943361984397, + "language_loss": 0.86073762, + "learning_rate": 0.0009119542471995752, + "loss": 0.87233007, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.18432617, + "step": 1123, + "time_per_iteration": 2.862286329269409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163328, + "balance_loss_mlp": 1.14537501, + "epoch": 0.2162370142362447, + "flos": 780989133312.0, + "grad_norm": 0.09258223897772182, + "language_loss": 0.81420332, + "learning_rate": 0.0009117776090966554, + "loss": 0.8258366, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.17956543, + "step": 1124, + "time_per_iteration": 2.957061767578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178568, + "balance_loss_mlp": 1.15982795, + "epoch": 0.21642939592150828, + "flos": 1002147406848.0, + "grad_norm": 0.08713542738122697, + "language_loss": 0.86376691, + "learning_rate": 0.0009116008111274899, + "loss": 0.87555259, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.18725586, + "step": 1125, + "time_per_iteration": 3.2553656101226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134598, + "balance_loss_mlp": 1.12191415, + "epoch": 0.21662177760677184, + "flos": 1482644238336.0, + "grad_norm": 0.04404830998294008, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.80241525, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.12695312, + "step": 1126, + "time_per_iteration": 4.808468818664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178721, + "balance_loss_mlp": 1.16074455, + "epoch": 0.2168141592920354, + "flos": 887395046400.0, + "grad_norm": 0.11245559393918578, + "language_loss": 0.8463136, + "learning_rate": 0.0009112467358650396, + "loss": 0.85810077, + "num_input_tokens_seen": 
93756848, + "router_z_loss_mlp": 0.17993164, + "step": 1127, + "time_per_iteration": 3.2135119438171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203773, + "balance_loss_mlp": 1.18573689, + "epoch": 0.21700654097729896, + "flos": 545961959424.0, + "grad_norm": 0.12344639465473216, + "language_loss": 0.86497682, + "learning_rate": 0.0009110694587092192, + "loss": 0.87701452, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.18041992, + "step": 1128, + "time_per_iteration": 2.76655650138855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187728, + "balance_loss_mlp": 1.17007267, + "epoch": 0.21719892266256252, + "flos": 509522379264.0, + "grad_norm": 0.08979647183610162, + "language_loss": 0.81230694, + "learning_rate": 0.0009108920219620815, + "loss": 0.82418424, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.17675781, + "step": 1129, + "time_per_iteration": 2.654778242111206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213499, + "balance_loss_mlp": 1.19534314, + "epoch": 0.21739130434782608, + "flos": 543412177920.0, + "grad_norm": 0.09421163362280094, + "language_loss": 0.89139944, + "learning_rate": 0.0009107144256925133, + "loss": 0.90353441, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.18164062, + "step": 1130, + "time_per_iteration": 2.6828513145446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118696, + "balance_loss_mlp": 1.1690309, + "epoch": 0.21758368603308964, + "flos": 616847477760.0, + "grad_norm": 0.10043397732842237, + "language_loss": 0.82135975, + "learning_rate": 0.0009105366699694638, + "loss": 0.83322936, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.17944336, + "step": 1131, + "time_per_iteration": 2.7368264198303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156124, + "balance_loss_mlp": 1.13807523, + "epoch": 0.2177760677183532, + "flos": 635116640256.0, + "grad_norm": 
0.06866995192565088, + "language_loss": 0.8126269, + "learning_rate": 0.0009103587548619439, + "loss": 0.82418817, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.18066406, + "step": 1132, + "time_per_iteration": 2.8550221920013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127864, + "balance_loss_mlp": 1.10951805, + "epoch": 0.2179684494036168, + "flos": 532463818752.0, + "grad_norm": 0.07626365128544196, + "language_loss": 0.85966831, + "learning_rate": 0.0009101806804390261, + "loss": 0.87094694, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.18359375, + "step": 1133, + "time_per_iteration": 2.865067720413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104426, + "balance_loss_mlp": 1.08616304, + "epoch": 0.21816083108888035, + "flos": 475219975680.0, + "grad_norm": 0.0835029551695644, + "language_loss": 0.89787459, + "learning_rate": 0.0009100024467698453, + "loss": 0.90891886, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.18261719, + "step": 1134, + "time_per_iteration": 2.587308645248413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107106, + "balance_loss_mlp": 1.08858073, + "epoch": 0.2183532127741439, + "flos": 577467532800.0, + "grad_norm": 0.1261525750794289, + "language_loss": 0.8228271, + "learning_rate": 0.0009098240539235981, + "loss": 0.83389813, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.1854248, + "step": 1135, + "time_per_iteration": 2.672178268432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118389, + "balance_loss_mlp": 1.10042465, + "epoch": 0.21854559445940747, + "flos": 594120780288.0, + "grad_norm": 0.07190677595982913, + "language_loss": 0.87357873, + "learning_rate": 0.0009096455019695423, + "loss": 0.88476264, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.17980957, + "step": 1136, + "time_per_iteration": 2.7987098693847656 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01132882, + "balance_loss_mlp": 1.1147505, + "epoch": 0.21873797614467103, + "flos": 408680764416.0, + "grad_norm": 0.07940180090442328, + "language_loss": 0.89624888, + "learning_rate": 0.000909466790976998, + "loss": 0.90757769, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.18139648, + "step": 1137, + "time_per_iteration": 2.477332830429077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135864, + "balance_loss_mlp": 1.11760151, + "epoch": 0.21893035782993459, + "flos": 894189818880.0, + "grad_norm": 0.0834179991172278, + "language_loss": 0.82063508, + "learning_rate": 0.0009092879210153473, + "loss": 0.83199376, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.18261719, + "step": 1138, + "time_per_iteration": 3.12052321434021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144384, + "balance_loss_mlp": 1.12646723, + "epoch": 0.21912273951519814, + "flos": 467627157504.0, + "grad_norm": 0.08144398942367967, + "language_loss": 0.88541782, + "learning_rate": 0.0009091088921540333, + "loss": 0.89686167, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.17919922, + "step": 1139, + "time_per_iteration": 2.616718292236328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059921, + "balance_loss_mlp": 1.04833436, + "epoch": 0.2193151212004617, + "flos": 1532043445248.0, + "grad_norm": 0.03144960121690337, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76568598, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.11572266, + "step": 1140, + "time_per_iteration": 4.950219392776489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159199, + "balance_loss_mlp": 1.14099586, + "epoch": 0.2195075028857253, + "flos": 591175646208.0, + "grad_norm": 0.08175747516698374, + "language_loss": 0.84013134, + "learning_rate": 0.0009087503580104985, + "loss": 0.85172331, + "num_input_tokens_seen": 
94926880, + "router_z_loss_mlp": 0.18212891, + "step": 1141, + "time_per_iteration": 2.7156832218170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169076, + "balance_loss_mlp": 1.15111113, + "epoch": 0.21969988457098885, + "flos": 636329862144.0, + "grad_norm": 0.09158845445189351, + "language_loss": 0.7908268, + "learning_rate": 0.0009085708528674728, + "loss": 0.80251753, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.17993164, + "step": 1142, + "time_per_iteration": 2.7931153774261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164556, + "balance_loss_mlp": 1.14653111, + "epoch": 0.2198922662562524, + "flos": 912350324736.0, + "grad_norm": 0.08286913258708346, + "language_loss": 0.86118239, + "learning_rate": 0.0009083911891031745, + "loss": 0.87282795, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.18041992, + "step": 1143, + "time_per_iteration": 3.116783857345581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117374, + "balance_loss_mlp": 1.15575087, + "epoch": 0.22008464794151597, + "flos": 822980528640.0, + "grad_norm": 0.10598120448533326, + "language_loss": 0.91152728, + "learning_rate": 0.0009082113667873553, + "loss": 0.92326462, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.18005371, + "step": 1144, + "time_per_iteration": 3.1333653926849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165589, + "balance_loss_mlp": 1.14781499, + "epoch": 0.22027702962677953, + "flos": 459656239104.0, + "grad_norm": 0.09559133609898889, + "language_loss": 0.9010762, + "learning_rate": 0.0009080313859898283, + "loss": 0.91273212, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.17773438, + "step": 1145, + "time_per_iteration": 2.5269837379455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158069, + "balance_loss_mlp": 1.13981819, + "epoch": 0.2204694113120431, + "flos": 531255739392.0, + "grad_norm": 
0.08379728657337264, + "language_loss": 0.91627228, + "learning_rate": 0.0009078512467804684, + "loss": 0.92785299, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.18249512, + "step": 1146, + "time_per_iteration": 2.6481103897094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115684, + "balance_loss_mlp": 1.13930488, + "epoch": 0.22066179299730665, + "flos": 522642419712.0, + "grad_norm": 0.08494148813195015, + "language_loss": 0.90029317, + "learning_rate": 0.0009076709492292119, + "loss": 0.91186154, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.17541504, + "step": 1147, + "time_per_iteration": 2.659444808959961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156044, + "balance_loss_mlp": 1.1380074, + "epoch": 0.2208541746825702, + "flos": 546451287552.0, + "grad_norm": 0.08635236800942281, + "language_loss": 0.88836294, + "learning_rate": 0.0009074904934060562, + "loss": 0.89992332, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.18041992, + "step": 1148, + "time_per_iteration": 2.6803669929504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154403, + "balance_loss_mlp": 1.13666439, + "epoch": 0.22104655636783377, + "flos": 708734748672.0, + "grad_norm": 0.0889091403520225, + "language_loss": 0.84333098, + "learning_rate": 0.0009073098793810607, + "loss": 0.85487497, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.17749023, + "step": 1149, + "time_per_iteration": 2.9655888080596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142518, + "balance_loss_mlp": 1.12488723, + "epoch": 0.22123893805309736, + "flos": 584867630592.0, + "grad_norm": 0.1004212836055253, + "language_loss": 0.88171208, + "learning_rate": 0.000907129107224346, + "loss": 0.89313722, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.17651367, + "step": 1150, + "time_per_iteration": 2.7072501182556152 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0114998, + "balance_loss_mlp": 1.13255119, + "epoch": 0.22143131973836092, + "flos": 492251323392.0, + "grad_norm": 0.06570196764831916, + "language_loss": 0.88176614, + "learning_rate": 0.0009069481770060939, + "loss": 0.8932659, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.17443848, + "step": 1151, + "time_per_iteration": 2.685103178024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154372, + "balance_loss_mlp": 1.13708711, + "epoch": 0.22162370142362448, + "flos": 1079674251264.0, + "grad_norm": 0.09650141097201487, + "language_loss": 0.83268076, + "learning_rate": 0.000906767088796548, + "loss": 0.84422451, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.17297363, + "step": 1152, + "time_per_iteration": 3.4740118980407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116577, + "balance_loss_mlp": 1.14875841, + "epoch": 0.22181608310888803, + "flos": 492508283904.0, + "grad_norm": 0.08954893541671843, + "language_loss": 0.86883795, + "learning_rate": 0.0009065858426660127, + "loss": 0.88049567, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.17028809, + "step": 1153, + "time_per_iteration": 2.6959545612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162713, + "balance_loss_mlp": 1.14552331, + "epoch": 0.2220084647941516, + "flos": 724014360576.0, + "grad_norm": 0.08642937771359972, + "language_loss": 0.84477949, + "learning_rate": 0.0009064044386848543, + "loss": 0.85640663, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.17199707, + "step": 1154, + "time_per_iteration": 2.9327309131622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148113, + "balance_loss_mlp": 1.13044643, + "epoch": 0.22220084647941515, + "flos": 489239377920.0, + "grad_norm": 0.10097530204718137, + "language_loss": 0.8819679, + "learning_rate": 0.0009062228769234997, + "loss": 0.89344907, + "num_input_tokens_seen": 
95969952, + "router_z_loss_mlp": 0.17675781, + "step": 1155, + "time_per_iteration": 2.594517469406128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131691, + "balance_loss_mlp": 1.11384535, + "epoch": 0.2223932281646787, + "flos": 536278952448.0, + "grad_norm": 0.08570412042921306, + "language_loss": 0.80458236, + "learning_rate": 0.0009060411574524376, + "loss": 0.81589925, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.17858887, + "step": 1156, + "time_per_iteration": 2.6829988956451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121054, + "balance_loss_mlp": 1.10336328, + "epoch": 0.22258560984994227, + "flos": 931420104192.0, + "grad_norm": 0.09330017299295373, + "language_loss": 0.87879562, + "learning_rate": 0.0009058592803422178, + "loss": 0.89000618, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.17712402, + "step": 1157, + "time_per_iteration": 3.181018829345703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121897, + "balance_loss_mlp": 1.10911822, + "epoch": 0.22277799153520586, + "flos": 1199675930112.0, + "grad_norm": 0.048914379983556036, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79832184, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.12792969, + "step": 1158, + "time_per_iteration": 4.887088775634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115665, + "balance_loss_mlp": 1.0982244, + "epoch": 0.22297037322046942, + "flos": 501304412160.0, + "grad_norm": 0.0696072904806853, + "language_loss": 0.89700031, + "learning_rate": 0.00090549505348681, + "loss": 0.90815699, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.17456055, + "step": 1159, + "time_per_iteration": 2.598071813583374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112274, + "balance_loss_mlp": 1.09486985, + "epoch": 0.22316275490573298, + "flos": 752752465920.0, + "grad_norm": 
0.12380497141241992, + "language_loss": 0.83892691, + "learning_rate": 0.0009053127038830275, + "loss": 0.85004961, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.17407227, + "step": 1160, + "time_per_iteration": 2.972153663635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105235, + "balance_loss_mlp": 1.08817601, + "epoch": 0.22335513659099654, + "flos": 514802552832.0, + "grad_norm": 0.11211348915152936, + "language_loss": 0.86961317, + "learning_rate": 0.000905130196922898, + "loss": 0.88066548, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.17077637, + "step": 1161, + "time_per_iteration": 2.586404800415039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103766, + "balance_loss_mlp": 1.08674335, + "epoch": 0.2235475182762601, + "flos": 484530024960.0, + "grad_norm": 0.08844003676149725, + "language_loss": 0.8712495, + "learning_rate": 0.0009049475326772769, + "loss": 0.88228714, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.17028809, + "step": 1162, + "time_per_iteration": 2.633775472640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115059, + "balance_loss_mlp": 1.09810734, + "epoch": 0.22373989996152366, + "flos": 469971735552.0, + "grad_norm": 0.08335674073816261, + "language_loss": 0.83002663, + "learning_rate": 0.0009047647112170811, + "loss": 0.84117723, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.16967773, + "step": 1163, + "time_per_iteration": 2.779890537261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112691, + "balance_loss_mlp": 1.11049509, + "epoch": 0.22393228164678722, + "flos": 1271012249088.0, + "grad_norm": 0.19679577404354898, + "language_loss": 0.87137246, + "learning_rate": 0.0009045817326132876, + "loss": 0.88264161, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.16418457, + "step": 1164, + "time_per_iteration": 3.703150749206543 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01153627, + "balance_loss_mlp": 1.13630629, + "epoch": 0.22412466333205078, + "flos": 596334680064.0, + "grad_norm": 0.08115041291567808, + "language_loss": 0.83409214, + "learning_rate": 0.0009043985969369357, + "loss": 0.84562844, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.17333984, + "step": 1165, + "time_per_iteration": 2.8744845390319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175693, + "balance_loss_mlp": 1.15849137, + "epoch": 0.22431704501731436, + "flos": 608434219008.0, + "grad_norm": 0.06201627876445988, + "language_loss": 0.84104788, + "learning_rate": 0.0009042153042591245, + "loss": 0.85280478, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.17224121, + "step": 1166, + "time_per_iteration": 2.8617310523986816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184386, + "balance_loss_mlp": 1.16719604, + "epoch": 0.22450942670257792, + "flos": 906583394304.0, + "grad_norm": 0.08223980595448348, + "language_loss": 0.84917307, + "learning_rate": 0.0009040318546510146, + "loss": 0.86101699, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.17211914, + "step": 1167, + "time_per_iteration": 3.1852662563323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184421, + "balance_loss_mlp": 1.16730213, + "epoch": 0.22470180838784148, + "flos": 565301182464.0, + "grad_norm": 0.0789242941151387, + "language_loss": 0.85142338, + "learning_rate": 0.0009038482481838275, + "loss": 0.86326754, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.17126465, + "step": 1168, + "time_per_iteration": 2.69252347946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179663, + "balance_loss_mlp": 1.16241312, + "epoch": 0.22489419007310504, + "flos": 834469972992.0, + "grad_norm": 0.05697426763288438, + "language_loss": 0.86826229, + "learning_rate": 0.0009036644849288455, + "loss": 0.88005894, + "num_input_tokens_seen": 
97181424, + "router_z_loss_mlp": 0.17260742, + "step": 1169, + "time_per_iteration": 3.1488285064697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174012, + "balance_loss_mlp": 1.15652442, + "epoch": 0.2250865717583686, + "flos": 581057639424.0, + "grad_norm": 0.08495924937221859, + "language_loss": 0.85084724, + "learning_rate": 0.0009034805649574118, + "loss": 0.86258733, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.1751709, + "step": 1170, + "time_per_iteration": 2.685328722000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183548, + "balance_loss_mlp": 1.16578627, + "epoch": 0.22527895344363216, + "flos": 600406401024.0, + "grad_norm": 0.11014400581169416, + "language_loss": 0.85017669, + "learning_rate": 0.0009032964883409308, + "loss": 0.86201215, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.17785645, + "step": 1171, + "time_per_iteration": 2.879601240158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114011, + "balance_loss_mlp": 1.10170817, + "epoch": 0.22547133512889572, + "flos": 1440751587840.0, + "grad_norm": 0.052120324196256125, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74164546, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.12255859, + "step": 1172, + "time_per_iteration": 5.038167715072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198228, + "balance_loss_mlp": 1.18021595, + "epoch": 0.22566371681415928, + "flos": 490618156032.0, + "grad_norm": 0.07370263777730128, + "language_loss": 0.87101096, + "learning_rate": 0.0009029278654587462, + "loss": 0.88299322, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.18017578, + "step": 1173, + "time_per_iteration": 2.627659559249878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207558, + "balance_loss_mlp": 1.1888895, + "epoch": 0.22585609849942284, + "flos": 604616887296.0, + "grad_norm": 
0.09375965630696953, + "language_loss": 0.82013619, + "learning_rate": 0.0009027433193361548, + "loss": 0.83221173, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.18652344, + "step": 1174, + "time_per_iteration": 2.8188316822052 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191442, + "balance_loss_mlp": 1.17263079, + "epoch": 0.22604848018468643, + "flos": 635568892416.0, + "grad_norm": 0.09826481383842127, + "language_loss": 0.8677392, + "learning_rate": 0.00090255861685474, + "loss": 0.87965363, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.18798828, + "step": 1175, + "time_per_iteration": 2.7677559852600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187652, + "balance_loss_mlp": 1.16895974, + "epoch": 0.22624086186995, + "flos": 479875000320.0, + "grad_norm": 0.09211807586067215, + "language_loss": 0.90504396, + "learning_rate": 0.0009023737580862095, + "loss": 0.91692042, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.18676758, + "step": 1176, + "time_per_iteration": 2.54901123046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01191354, + "balance_loss_mlp": 1.17276883, + "epoch": 0.22643324355521355, + "flos": 495814265856.0, + "grad_norm": 0.0881916579324479, + "language_loss": 0.83226693, + "learning_rate": 0.0009021887431023321, + "loss": 0.84418046, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.18566895, + "step": 1177, + "time_per_iteration": 2.6121795177459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174332, + "balance_loss_mlp": 1.15594959, + "epoch": 0.2266256252404771, + "flos": 561552860160.0, + "grad_norm": 0.08194623484888001, + "language_loss": 0.87241113, + "learning_rate": 0.0009020035719749369, + "loss": 0.88415444, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.18359375, + "step": 1178, + "time_per_iteration": 2.7401885986328125 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01158606, + "balance_loss_mlp": 1.14040256, + "epoch": 0.22681800692574067, + "flos": 579688399872.0, + "grad_norm": 0.0813633568079927, + "language_loss": 0.77680194, + "learning_rate": 0.0009018182447759136, + "loss": 0.78838801, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.18212891, + "step": 1179, + "time_per_iteration": 3.0078771114349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145135, + "balance_loss_mlp": 1.12688398, + "epoch": 0.22701038861100423, + "flos": 740166170112.0, + "grad_norm": 0.09172856476896407, + "language_loss": 0.79547179, + "learning_rate": 0.0009016327615772126, + "loss": 0.80692315, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.18249512, + "step": 1180, + "time_per_iteration": 2.956892251968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140365, + "balance_loss_mlp": 1.12199533, + "epoch": 0.2272027702962678, + "flos": 577257560064.0, + "grad_norm": 0.0875125644607483, + "language_loss": 0.87631428, + "learning_rate": 0.0009014471224508451, + "loss": 0.8877179, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.18359375, + "step": 1181, + "time_per_iteration": 2.6819214820861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140649, + "balance_loss_mlp": 1.12244546, + "epoch": 0.22739515198153135, + "flos": 544267123200.0, + "grad_norm": 0.12040521041324766, + "language_loss": 0.82781821, + "learning_rate": 0.0009012613274688823, + "loss": 0.8392247, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.18200684, + "step": 1182, + "time_per_iteration": 2.6545872688293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127803, + "balance_loss_mlp": 1.10971928, + "epoch": 0.22758753366679493, + "flos": 440163942912.0, + "grad_norm": 0.11611648539449336, + "language_loss": 0.87670434, + "learning_rate": 0.0009010753767034565, + "loss": 0.88798231, + "num_input_tokens_seen": 
98335632, + "router_z_loss_mlp": 0.1809082, + "step": 1183, + "time_per_iteration": 2.5755655765533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011456, + "balance_loss_mlp": 1.12726605, + "epoch": 0.2277799153520585, + "flos": 729447607296.0, + "grad_norm": 0.07779286107938752, + "language_loss": 0.78790247, + "learning_rate": 0.0009008892702267599, + "loss": 0.79935843, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.18347168, + "step": 1184, + "time_per_iteration": 2.9940855503082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145741, + "balance_loss_mlp": 1.12732279, + "epoch": 0.22797229703732205, + "flos": 526894751232.0, + "grad_norm": 0.09447672073297446, + "language_loss": 0.88500011, + "learning_rate": 0.0009007030081110457, + "loss": 0.89645755, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.18408203, + "step": 1185, + "time_per_iteration": 2.6603288650512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143533, + "balance_loss_mlp": 1.12500811, + "epoch": 0.2281646787225856, + "flos": 535431347712.0, + "grad_norm": 0.0853307601225198, + "language_loss": 0.84380877, + "learning_rate": 0.000900516590428627, + "loss": 0.85524416, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.18518066, + "step": 1186, + "time_per_iteration": 2.692070484161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141181, + "balance_loss_mlp": 1.12318015, + "epoch": 0.22835706040784917, + "flos": 541381086720.0, + "grad_norm": 0.07243217971015652, + "language_loss": 0.89009422, + "learning_rate": 0.0009003300172518778, + "loss": 0.90150601, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.17980957, + "step": 1187, + "time_per_iteration": 2.7073988914489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137333, + "balance_loss_mlp": 1.11980963, + "epoch": 0.22854944209311273, + "flos": 790637635584.0, + "grad_norm": 
0.08424899879196017, + "language_loss": 0.83985436, + "learning_rate": 0.0009001432886532321, + "loss": 0.85122764, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.17529297, + "step": 1188, + "time_per_iteration": 2.9843039512634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146183, + "balance_loss_mlp": 1.12812281, + "epoch": 0.2287418237783763, + "flos": 469280148480.0, + "grad_norm": 0.0771143581641096, + "language_loss": 0.8654418, + "learning_rate": 0.0008999564047051843, + "loss": 0.87690365, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.18054199, + "step": 1189, + "time_per_iteration": 2.6047263145446777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152979, + "balance_loss_mlp": 1.13572931, + "epoch": 0.22893420546363985, + "flos": 468029850624.0, + "grad_norm": 0.0974051284777214, + "language_loss": 0.85100305, + "learning_rate": 0.0008997693654802894, + "loss": 0.86253285, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.17272949, + "step": 1190, + "time_per_iteration": 2.6849515438079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134691, + "balance_loss_mlp": 1.11709571, + "epoch": 0.22912658714890344, + "flos": 626258843136.0, + "grad_norm": 0.08474903758704144, + "language_loss": 0.86204302, + "learning_rate": 0.0008995821710511625, + "loss": 0.87338996, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.17602539, + "step": 1191, + "time_per_iteration": 2.742478132247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126818, + "balance_loss_mlp": 1.10922277, + "epoch": 0.229318968834167, + "flos": 503031555072.0, + "grad_norm": 0.08571505564163927, + "language_loss": 0.84842807, + "learning_rate": 0.0008993948214904786, + "loss": 0.85969627, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.17602539, + "step": 1192, + "time_per_iteration": 2.6361818313598633 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01045247, + "balance_loss_mlp": 1.03237247, + "epoch": 0.22951135051943056, + "flos": 1374827613696.0, + "grad_norm": 0.028329103864080232, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79467458, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.12890625, + "step": 1193, + "time_per_iteration": 4.969930171966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112876, + "balance_loss_mlp": 1.10972273, + "epoch": 0.22970373220469412, + "flos": 644345197056.0, + "grad_norm": 0.08612491826756107, + "language_loss": 0.78059292, + "learning_rate": 0.0008990196572654427, + "loss": 0.79188055, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.19018555, + "step": 1194, + "time_per_iteration": 2.8844966888427734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140316, + "balance_loss_mlp": 1.12217188, + "epoch": 0.22989611388995768, + "flos": 500209758720.0, + "grad_norm": 0.10153558100200434, + "language_loss": 0.87920988, + "learning_rate": 0.0008988318427467426, + "loss": 0.89061302, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.18151855, + "step": 1195, + "time_per_iteration": 2.687624931335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142082, + "balance_loss_mlp": 1.12412882, + "epoch": 0.23008849557522124, + "flos": 1096522790400.0, + "grad_norm": 0.08230259672194101, + "language_loss": 0.86206847, + "learning_rate": 0.0008986438733877887, + "loss": 0.87348932, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.17956543, + "step": 1196, + "time_per_iteration": 3.4957938194274902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153338, + "balance_loss_mlp": 1.13559973, + "epoch": 0.2302808772604848, + "flos": 683648418816.0, + "grad_norm": 0.06895925957333625, + "language_loss": 0.8397938, + "learning_rate": 0.0008984557492615576, + "loss": 0.85132712, + "num_input_tokens_seen": 
99539616, + "router_z_loss_mlp": 0.1776123, + "step": 1197, + "time_per_iteration": 3.004096031188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148862, + "balance_loss_mlp": 1.13082576, + "epoch": 0.23047325894574835, + "flos": 528923271168.0, + "grad_norm": 0.07382939590065767, + "language_loss": 0.89479733, + "learning_rate": 0.0008982674704410854, + "loss": 0.906286, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.18029785, + "step": 1198, + "time_per_iteration": 2.6988983154296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115166, + "balance_loss_mlp": 1.13448238, + "epoch": 0.23066564063101191, + "flos": 682766309376.0, + "grad_norm": 0.0949037059675448, + "language_loss": 0.77658606, + "learning_rate": 0.0008980790369994682, + "loss": 0.78810263, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.17199707, + "step": 1199, + "time_per_iteration": 2.9618003368377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154837, + "balance_loss_mlp": 1.13739705, + "epoch": 0.2308580223162755, + "flos": 558523662336.0, + "grad_norm": 0.07145246308543461, + "language_loss": 0.87144834, + "learning_rate": 0.000897890449009863, + "loss": 0.88299668, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.17443848, + "step": 1200, + "time_per_iteration": 2.7796213626861572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116547, + "balance_loss_mlp": 1.14776802, + "epoch": 0.23105040400153906, + "flos": 555669932544.0, + "grad_norm": 0.09854596236312584, + "language_loss": 0.89783561, + "learning_rate": 0.0008977017065454853, + "loss": 0.90949035, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.17712402, + "step": 1201, + "time_per_iteration": 2.7383389472961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118456, + "balance_loss_mlp": 1.16748941, + "epoch": 0.23124278568680262, + "flos": 704788936704.0, + "grad_norm": 
0.06681897447915772, + "language_loss": 0.79928529, + "learning_rate": 0.0008975128096796121, + "loss": 0.81113094, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.17077637, + "step": 1202, + "time_per_iteration": 2.893461227416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174856, + "balance_loss_mlp": 1.15766644, + "epoch": 0.23143516737206618, + "flos": 612768043008.0, + "grad_norm": 0.09321616984993739, + "language_loss": 0.85471004, + "learning_rate": 0.0008973237584855794, + "loss": 0.86645865, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.17211914, + "step": 1203, + "time_per_iteration": 2.898749589920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174851, + "balance_loss_mlp": 1.15761375, + "epoch": 0.23162754905732974, + "flos": 389242796544.0, + "grad_norm": 0.08459599639125864, + "language_loss": 0.82237399, + "learning_rate": 0.0008971345530367832, + "loss": 0.83412254, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.17248535, + "step": 1204, + "time_per_iteration": 2.5461792945861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169858, + "balance_loss_mlp": 1.15260816, + "epoch": 0.2318199307425933, + "flos": 667778535936.0, + "grad_norm": 0.08050630983240942, + "language_loss": 0.85032547, + "learning_rate": 0.0008969451934066799, + "loss": 0.86202407, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.17272949, + "step": 1205, + "time_per_iteration": 2.8455100059509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157737, + "balance_loss_mlp": 1.1401062, + "epoch": 0.23201231242785686, + "flos": 666399757824.0, + "grad_norm": 0.09118158600793376, + "language_loss": 0.79779387, + "learning_rate": 0.0008967556796687854, + "loss": 0.80937129, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.1763916, + "step": 1206, + "time_per_iteration": 2.977187395095825 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01166868, + "balance_loss_mlp": 1.14940381, + "epoch": 0.23220469411312042, + "flos": 748816565760.0, + "grad_norm": 0.08470401629761377, + "language_loss": 0.83790028, + "learning_rate": 0.0008965660118966752, + "loss": 0.8495689, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.17480469, + "step": 1207, + "time_per_iteration": 2.9695510864257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164887, + "balance_loss_mlp": 1.14745879, + "epoch": 0.232397075798384, + "flos": 667061982720.0, + "grad_norm": 0.07067711449707674, + "language_loss": 0.89920551, + "learning_rate": 0.0008963761901639851, + "loss": 0.9108544, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.17443848, + "step": 1208, + "time_per_iteration": 2.8432528972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164904, + "balance_loss_mlp": 1.14763093, + "epoch": 0.23258945748364757, + "flos": 610218261504.0, + "grad_norm": 0.07998084189671781, + "language_loss": 0.83062428, + "learning_rate": 0.0008961862145444103, + "loss": 0.84227335, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.17285156, + "step": 1209, + "time_per_iteration": 2.7639503479003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161441, + "balance_loss_mlp": 1.14392972, + "epoch": 0.23278183916891113, + "flos": 489651982848.0, + "grad_norm": 0.07404933879866919, + "language_loss": 0.85019284, + "learning_rate": 0.0008959960851117059, + "loss": 0.86180723, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.17541504, + "step": 1210, + "time_per_iteration": 2.639765739440918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142071, + "balance_loss_mlp": 1.12463081, + "epoch": 0.23297422085417469, + "flos": 511585403904.0, + "grad_norm": 0.06764705739880358, + "language_loss": 0.83661717, + "learning_rate": 0.0008958058019396868, + "loss": 0.8480379, + "num_input_tokens_seen": 
100575072, + "router_z_loss_mlp": 0.17468262, + "step": 1211, + "time_per_iteration": 2.8551721572875977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114749, + "balance_loss_mlp": 1.13016868, + "epoch": 0.23316660253943824, + "flos": 546421552128.0, + "grad_norm": 0.08875501668915448, + "language_loss": 0.86489981, + "learning_rate": 0.0008956153651022274, + "loss": 0.87637472, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.17333984, + "step": 1212, + "time_per_iteration": 2.7765469551086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144101, + "balance_loss_mlp": 1.12625563, + "epoch": 0.2333589842247018, + "flos": 510256184832.0, + "grad_norm": 0.07932001584083075, + "language_loss": 0.83832914, + "learning_rate": 0.0008954247746732618, + "loss": 0.84977019, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.17858887, + "step": 1213, + "time_per_iteration": 2.6084651947021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135394, + "balance_loss_mlp": 1.11788201, + "epoch": 0.23355136590996536, + "flos": 663148104192.0, + "grad_norm": 0.07442615591494516, + "language_loss": 0.90398782, + "learning_rate": 0.0008952340307267837, + "loss": 0.91534173, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.17529297, + "step": 1214, + "time_per_iteration": 2.89178466796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125335, + "balance_loss_mlp": 1.10793078, + "epoch": 0.23374374759522892, + "flos": 508457461248.0, + "grad_norm": 0.1453012637227399, + "language_loss": 0.8336947, + "learning_rate": 0.0008950431333368468, + "loss": 0.84494805, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.17419434, + "step": 1215, + "time_per_iteration": 2.5870306491851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111701, + "balance_loss_mlp": 1.09912825, + "epoch": 0.2339361292804925, + "flos": 1294455499776.0, + "grad_norm": 
0.07975417299664793, + "language_loss": 0.84537351, + "learning_rate": 0.0008948520825775634, + "loss": 0.8565436, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.17919922, + "step": 1216, + "time_per_iteration": 3.6591601371765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111106, + "balance_loss_mlp": 1.0930953, + "epoch": 0.23412851096575607, + "flos": 705928006656.0, + "grad_norm": 0.083699003973451, + "language_loss": 0.83777452, + "learning_rate": 0.0008946608785231067, + "loss": 0.84888518, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.1796875, + "step": 1217, + "time_per_iteration": 2.910045862197876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122887, + "balance_loss_mlp": 1.10500622, + "epoch": 0.23432089265101963, + "flos": 438263903232.0, + "grad_norm": 0.07421571727754571, + "language_loss": 0.8465637, + "learning_rate": 0.0008944695212477084, + "loss": 0.85779262, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.17871094, + "step": 1218, + "time_per_iteration": 2.524942636489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136819, + "balance_loss_mlp": 1.11900902, + "epoch": 0.2345132743362832, + "flos": 480939918336.0, + "grad_norm": 0.08988714641466837, + "language_loss": 0.85843921, + "learning_rate": 0.0008942780108256599, + "loss": 0.86980736, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.17822266, + "step": 1219, + "time_per_iteration": 2.638685703277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122459, + "balance_loss_mlp": 1.10441041, + "epoch": 0.23470565602154675, + "flos": 411453001728.0, + "grad_norm": 0.09147837202786416, + "language_loss": 0.86524791, + "learning_rate": 0.0008940863473313121, + "loss": 0.87647247, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.18054199, + "step": 1220, + "time_per_iteration": 2.5017247200012207 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01141789, + "balance_loss_mlp": 1.12406206, + "epoch": 0.2348980377068103, + "flos": 545450609664.0, + "grad_norm": 0.08221984397196716, + "language_loss": 0.87834692, + "learning_rate": 0.0008938945308390756, + "loss": 0.88976479, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.17724609, + "step": 1221, + "time_per_iteration": 2.663565158843994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145313, + "balance_loss_mlp": 1.1284095, + "epoch": 0.23509041939207387, + "flos": 575740389888.0, + "grad_norm": 0.07596551545175816, + "language_loss": 0.86929715, + "learning_rate": 0.00089370256142342, + "loss": 0.88075024, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.16918945, + "step": 1222, + "time_per_iteration": 2.7675375938415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143, + "balance_loss_mlp": 1.12577403, + "epoch": 0.23528280107733743, + "flos": 588843177984.0, + "grad_norm": 0.07111090095827391, + "language_loss": 0.84719163, + "learning_rate": 0.0008935104391588746, + "loss": 0.8586216, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.17248535, + "step": 1223, + "time_per_iteration": 2.7930641174316406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141088, + "balance_loss_mlp": 1.12308729, + "epoch": 0.235475182762601, + "flos": 823328893440.0, + "grad_norm": 0.09172313762061536, + "language_loss": 0.83210915, + "learning_rate": 0.0008933181641200276, + "loss": 0.84352005, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.18005371, + "step": 1224, + "time_per_iteration": 3.184723138809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113861, + "balance_loss_mlp": 1.1213243, + "epoch": 0.23566756444786457, + "flos": 680164770816.0, + "grad_norm": 0.08544958772393396, + "language_loss": 0.85490656, + "learning_rate": 0.0008931257363815271, + "loss": 0.86629266, + "num_input_tokens_seen": 
101616496, + "router_z_loss_mlp": 0.1730957, + "step": 1225, + "time_per_iteration": 2.9049925804138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116947, + "balance_loss_mlp": 1.09978044, + "epoch": 0.23585994613312813, + "flos": 701811495936.0, + "grad_norm": 0.08572157059192624, + "language_loss": 0.8983537, + "learning_rate": 0.0008929331560180798, + "loss": 0.90952325, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.171875, + "step": 1226, + "time_per_iteration": 2.976716995239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119433, + "balance_loss_mlp": 1.10198092, + "epoch": 0.2360523278183917, + "flos": 524176842240.0, + "grad_norm": 0.07629670414533757, + "language_loss": 0.90995669, + "learning_rate": 0.0008927404231044525, + "loss": 0.92115104, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.17468262, + "step": 1227, + "time_per_iteration": 2.754908561706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103828, + "balance_loss_mlp": 1.08611393, + "epoch": 0.23624470950365525, + "flos": 524310091776.0, + "grad_norm": 0.07882349010207228, + "language_loss": 0.81471217, + "learning_rate": 0.0008925475377154703, + "loss": 0.82575047, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.17736816, + "step": 1228, + "time_per_iteration": 2.7809646129608154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100869, + "balance_loss_mlp": 1.08254623, + "epoch": 0.2364370911889188, + "flos": 596811525120.0, + "grad_norm": 0.07142925877548961, + "language_loss": 0.82040304, + "learning_rate": 0.0008923544999260183, + "loss": 0.83141172, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.18322754, + "step": 1229, + "time_per_iteration": 2.760239362716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110144, + "balance_loss_mlp": 1.09266782, + "epoch": 0.23662947287418237, + "flos": 756849153024.0, + "grad_norm": 
0.12387153159230253, + "language_loss": 0.91337013, + "learning_rate": 0.00089216130981104, + "loss": 0.92447156, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.17480469, + "step": 1230, + "time_per_iteration": 3.121588945388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110904, + "balance_loss_mlp": 1.09090781, + "epoch": 0.23682185455944593, + "flos": 546167162880.0, + "grad_norm": 0.07661504881361146, + "language_loss": 0.82228827, + "learning_rate": 0.000891967967445539, + "loss": 0.83337867, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.18139648, + "step": 1231, + "time_per_iteration": 2.7672059535980225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109921, + "balance_loss_mlp": 1.0920639, + "epoch": 0.2370142362447095, + "flos": 662285818368.0, + "grad_norm": 0.054732650189263314, + "language_loss": 0.88646662, + "learning_rate": 0.0008917744729045772, + "loss": 0.89756578, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.17871094, + "step": 1232, + "time_per_iteration": 2.9028637409210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104934, + "balance_loss_mlp": 1.08743405, + "epoch": 0.23720661792997308, + "flos": 683670813696.0, + "grad_norm": 0.08391850168433768, + "language_loss": 0.83650339, + "learning_rate": 0.0008915808262632757, + "loss": 0.84755272, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.1751709, + "step": 1233, + "time_per_iteration": 2.870555877685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123449, + "balance_loss_mlp": 1.10509062, + "epoch": 0.23739899961523664, + "flos": 558909103104.0, + "grad_norm": 0.09539034143195195, + "language_loss": 0.92907977, + "learning_rate": 0.0008913870275968148, + "loss": 0.94031429, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.18359375, + "step": 1234, + "time_per_iteration": 2.7251648902893066 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01109776, + "balance_loss_mlp": 1.09154916, + "epoch": 0.2375913813005002, + "flos": 889546904064.0, + "grad_norm": 0.06697050939505883, + "language_loss": 0.87199342, + "learning_rate": 0.0008911930769804342, + "loss": 0.88309121, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.18237305, + "step": 1235, + "time_per_iteration": 3.268287420272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124889, + "balance_loss_mlp": 1.10593486, + "epoch": 0.23778376298576376, + "flos": 641120707584.0, + "grad_norm": 0.08058060241162714, + "language_loss": 0.91074061, + "learning_rate": 0.0008909989744894318, + "loss": 0.92198944, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.1895752, + "step": 1236, + "time_per_iteration": 2.8918802738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118206, + "balance_loss_mlp": 1.10007429, + "epoch": 0.23797614467102732, + "flos": 616820313600.0, + "grad_norm": 0.11301283658583765, + "language_loss": 0.81326294, + "learning_rate": 0.0008908047201991649, + "loss": 0.82444501, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.18127441, + "step": 1237, + "time_per_iteration": 2.8053224086761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111628, + "balance_loss_mlp": 1.09433031, + "epoch": 0.23816852635629088, + "flos": 624245004288.0, + "grad_norm": 0.0928222329851358, + "language_loss": 0.86241579, + "learning_rate": 0.0008906103141850502, + "loss": 0.87353206, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.17321777, + "step": 1238, + "time_per_iteration": 2.90500545501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117636, + "balance_loss_mlp": 1.09980249, + "epoch": 0.23836090804155444, + "flos": 521431769088.0, + "grad_norm": 0.08449694721293455, + "language_loss": 0.87626004, + "learning_rate": 0.0008904157565225621, + "loss": 0.88743639, + "num_input_tokens_seen": 
102681216, + "router_z_loss_mlp": 0.17834473, + "step": 1239, + "time_per_iteration": 2.687969923019409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126339, + "balance_loss_mlp": 1.10839748, + "epoch": 0.238553289726818, + "flos": 1153991660544.0, + "grad_norm": 0.08713278777958322, + "language_loss": 0.815947, + "learning_rate": 0.000890221047287235, + "loss": 0.82721043, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.17944336, + "step": 1240, + "time_per_iteration": 3.531710386276245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134139, + "balance_loss_mlp": 1.11636496, + "epoch": 0.23874567141208156, + "flos": 499861393920.0, + "grad_norm": 0.07670600064189544, + "language_loss": 0.90527886, + "learning_rate": 0.0008900261865546615, + "loss": 0.91662019, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.17797852, + "step": 1241, + "time_per_iteration": 2.6662704944610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152535, + "balance_loss_mlp": 1.13414097, + "epoch": 0.23893805309734514, + "flos": 556934911488.0, + "grad_norm": 0.12487758336027797, + "language_loss": 0.84415132, + "learning_rate": 0.0008898311744004936, + "loss": 0.85567665, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.18408203, + "step": 1242, + "time_per_iteration": 2.763388156890869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149998, + "balance_loss_mlp": 1.13165212, + "epoch": 0.2391304347826087, + "flos": 549270512640.0, + "grad_norm": 0.06740377455140158, + "language_loss": 0.86921692, + "learning_rate": 0.0008896360109004414, + "loss": 0.88071686, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.18359375, + "step": 1243, + "time_per_iteration": 2.6441633701324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140632, + "balance_loss_mlp": 1.12121248, + "epoch": 0.23932281646787226, + "flos": 516050279424.0, + "grad_norm": 
0.09575659644731266, + "language_loss": 0.84275168, + "learning_rate": 0.0008894406961302742, + "loss": 0.85415804, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.1940918, + "step": 1244, + "time_per_iteration": 2.6425938606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112564, + "balance_loss_mlp": 1.10582733, + "epoch": 0.23951519815313582, + "flos": 743682124800.0, + "grad_norm": 0.07353599262773654, + "language_loss": 0.83287829, + "learning_rate": 0.0008892452301658201, + "loss": 0.84413469, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.19799805, + "step": 1245, + "time_per_iteration": 2.9552412033081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105353, + "balance_loss_mlp": 1.08604133, + "epoch": 0.23970757983839938, + "flos": 554118257664.0, + "grad_norm": 0.06971047839699994, + "language_loss": 0.83254242, + "learning_rate": 0.0008890496130829653, + "loss": 0.84359598, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.19287109, + "step": 1246, + "time_per_iteration": 2.714538812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094484, + "balance_loss_mlp": 1.07490993, + "epoch": 0.23989996152366294, + "flos": 480655793664.0, + "grad_norm": 0.07160128232814054, + "language_loss": 0.85448045, + "learning_rate": 0.0008888538449576555, + "loss": 0.86542535, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.19567871, + "step": 1247, + "time_per_iteration": 2.5854134559631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081471, + "balance_loss_mlp": 1.06212282, + "epoch": 0.2400923432089265, + "flos": 485310818304.0, + "grad_norm": 0.10364601092251456, + "language_loss": 0.82938588, + "learning_rate": 0.0008886579258658944, + "loss": 0.84020054, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.1932373, + "step": 1248, + "time_per_iteration": 2.56381893157959 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01085911, + "balance_loss_mlp": 1.06643224, + "epoch": 0.24028472489419006, + "flos": 623555615232.0, + "grad_norm": 0.11636637674492897, + "language_loss": 0.84617007, + "learning_rate": 0.0008884618558837446, + "loss": 0.8570292, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.19470215, + "step": 1249, + "time_per_iteration": 2.8670427799224854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092713, + "balance_loss_mlp": 1.07287669, + "epoch": 0.24047710657945365, + "flos": 601602370560.0, + "grad_norm": 0.09934462101700196, + "language_loss": 0.86105502, + "learning_rate": 0.0008882656350873273, + "loss": 0.87198216, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.19836426, + "step": 1250, + "time_per_iteration": 2.9198391437530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095988, + "balance_loss_mlp": 1.07702184, + "epoch": 0.2406694882647172, + "flos": 841558781952.0, + "grad_norm": 0.10386778667644601, + "language_loss": 0.86847913, + "learning_rate": 0.0008880692635528219, + "loss": 0.879439, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.1895752, + "step": 1251, + "time_per_iteration": 3.114600658416748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108272, + "balance_loss_mlp": 1.08975875, + "epoch": 0.24086186994998077, + "flos": 527057736192.0, + "grad_norm": 0.09512533379834028, + "language_loss": 0.89605117, + "learning_rate": 0.0008878727413564669, + "loss": 0.90713388, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.18518066, + "step": 1252, + "time_per_iteration": 2.7784321308135986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044903, + "balance_loss_mlp": 1.0333159, + "epoch": 0.24105425163524433, + "flos": 1338261378048.0, + "grad_norm": 0.02598255704274824, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81180501, + "num_input_tokens_seen": 
103865824, + "router_z_loss_mlp": 0.11572266, + "step": 1253, + "time_per_iteration": 4.945368528366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142164, + "balance_loss_mlp": 1.12338829, + "epoch": 0.24124663332050789, + "flos": 614102404608.0, + "grad_norm": 0.08359922246859781, + "language_loss": 0.78146553, + "learning_rate": 0.0008874792452834528, + "loss": 0.79288721, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.18774414, + "step": 1254, + "time_per_iteration": 2.765700340270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144466, + "balance_loss_mlp": 1.12684703, + "epoch": 0.24143901500577145, + "flos": 575540328960.0, + "grad_norm": 0.08184252001830684, + "language_loss": 0.87274945, + "learning_rate": 0.0008872822715595626, + "loss": 0.88419414, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.17626953, + "step": 1255, + "time_per_iteration": 2.687319040298462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141993, + "balance_loss_mlp": 1.12460077, + "epoch": 0.241631396691035, + "flos": 495181776384.0, + "grad_norm": 0.10883062221863066, + "language_loss": 0.86691022, + "learning_rate": 0.0008870851474793598, + "loss": 0.87833017, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.17419434, + "step": 1256, + "time_per_iteration": 2.6231887340545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136562, + "balance_loss_mlp": 1.11930037, + "epoch": 0.24182377837629856, + "flos": 636191470080.0, + "grad_norm": 0.08915320009922777, + "language_loss": 0.89053321, + "learning_rate": 0.0008868878731193752, + "loss": 0.90189886, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.17285156, + "step": 1257, + "time_per_iteration": 2.928931713104248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113174, + "balance_loss_mlp": 1.11484766, + "epoch": 0.24201616006156215, + "flos": 515219927040.0, + "grad_norm": 
0.08262742442990392, + "language_loss": 0.89427495, + "learning_rate": 0.0008866904485561973, + "loss": 0.90559232, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.16906738, + "step": 1258, + "time_per_iteration": 2.7494447231292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136898, + "balance_loss_mlp": 1.11986327, + "epoch": 0.2422085417468257, + "flos": 615144927744.0, + "grad_norm": 0.08559449998713918, + "language_loss": 0.82794583, + "learning_rate": 0.000886492873866473, + "loss": 0.83931482, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.17053223, + "step": 1259, + "time_per_iteration": 2.841770648956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112569, + "balance_loss_mlp": 1.10853612, + "epoch": 0.24240092343208927, + "flos": 585794156544.0, + "grad_norm": 0.12665734927529698, + "language_loss": 0.8437835, + "learning_rate": 0.000886295149126908, + "loss": 0.85504043, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.17163086, + "step": 1260, + "time_per_iteration": 2.7847495079040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119489, + "balance_loss_mlp": 1.10270476, + "epoch": 0.24259330511735283, + "flos": 762257806848.0, + "grad_norm": 0.13276121908066757, + "language_loss": 0.85482794, + "learning_rate": 0.0008860972744142655, + "loss": 0.86602283, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.16796875, + "step": 1261, + "time_per_iteration": 2.9415853023529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117567, + "balance_loss_mlp": 1.10078192, + "epoch": 0.2427856868026164, + "flos": 626878849536.0, + "grad_norm": 0.09469206100439348, + "language_loss": 0.81489432, + "learning_rate": 0.0008858992498053671, + "loss": 0.82606995, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.16796875, + "step": 1262, + "time_per_iteration": 2.8460397720336914 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01058087, + "balance_loss_mlp": 1.04578424, + "epoch": 0.24297806848787995, + "flos": 1511653985280.0, + "grad_norm": 0.030096600393216412, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.7764684, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.12304688, + "step": 1263, + "time_per_iteration": 4.891434192657471 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164356, + "balance_loss_mlp": 1.14685583, + "epoch": 0.2431704501731435, + "flos": 541949336064.0, + "grad_norm": 0.07687362244804527, + "language_loss": 0.83471984, + "learning_rate": 0.0008855027512063817, + "loss": 0.84636343, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.17504883, + "step": 1264, + "time_per_iteration": 2.729905843734741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188155, + "balance_loss_mlp": 1.17034483, + "epoch": 0.24336283185840707, + "flos": 523845729792.0, + "grad_norm": 0.10565566639423048, + "language_loss": 0.85338992, + "learning_rate": 0.0008853042773702292, + "loss": 0.86527145, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.17810059, + "step": 1265, + "time_per_iteration": 2.7027270793914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213519, + "balance_loss_mlp": 1.19497013, + "epoch": 0.24355521354367063, + "flos": 537111502848.0, + "grad_norm": 0.10310511352597752, + "language_loss": 0.87869942, + "learning_rate": 0.0008851056539456896, + "loss": 0.89083463, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.1854248, + "step": 1266, + "time_per_iteration": 2.7062103748321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01190822, + "balance_loss_mlp": 1.17235637, + "epoch": 0.24374759522893422, + "flos": 930461271552.0, + "grad_norm": 0.106198185782814, + "language_loss": 0.81649381, + "learning_rate": 0.0008849068810098755, + "loss": 0.82840204, + "num_input_tokens_seen": 
105075056, + "router_z_loss_mlp": 0.18469238, + "step": 1267, + "time_per_iteration": 3.329357862472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169809, + "balance_loss_mlp": 1.15086627, + "epoch": 0.24393997691419778, + "flos": 427787619840.0, + "grad_norm": 0.11133940138273103, + "language_loss": 0.82717752, + "learning_rate": 0.0008847079586399575, + "loss": 0.83887565, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.18945312, + "step": 1268, + "time_per_iteration": 2.558319091796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131294, + "balance_loss_mlp": 1.11318588, + "epoch": 0.24413235859946134, + "flos": 578853651456.0, + "grad_norm": 0.08817279245044941, + "language_loss": 0.85679001, + "learning_rate": 0.0008845088869131641, + "loss": 0.86810291, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.18103027, + "step": 1269, + "time_per_iteration": 2.692885637283325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122646, + "balance_loss_mlp": 1.10412109, + "epoch": 0.2443247402847249, + "flos": 529859708928.0, + "grad_norm": 0.07664646159291034, + "language_loss": 0.88602984, + "learning_rate": 0.0008843096659067818, + "loss": 0.89725631, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.18505859, + "step": 1270, + "time_per_iteration": 2.688197374343872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117697, + "balance_loss_mlp": 1.09989929, + "epoch": 0.24451712196998845, + "flos": 696321349632.0, + "grad_norm": 0.06543357243765746, + "language_loss": 0.86065173, + "learning_rate": 0.000884110295698155, + "loss": 0.87182868, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.17822266, + "step": 1271, + "time_per_iteration": 2.9497103691101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113614, + "balance_loss_mlp": 1.09520805, + "epoch": 0.24470950365525201, + "flos": 529832544768.0, + 
"grad_norm": 0.10345235518870362, + "language_loss": 0.85674417, + "learning_rate": 0.0008839107763646861, + "loss": 0.86788034, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.18395996, + "step": 1272, + "time_per_iteration": 2.6293063163757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111354, + "balance_loss_mlp": 1.09307909, + "epoch": 0.24490188534051557, + "flos": 491342049792.0, + "grad_norm": 0.0866440520117465, + "language_loss": 0.90339661, + "learning_rate": 0.0008837111079838353, + "loss": 0.91451013, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.18273926, + "step": 1273, + "time_per_iteration": 2.7676210403442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112497, + "balance_loss_mlp": 1.10732698, + "epoch": 0.24509426702577913, + "flos": 474155057664.0, + "grad_norm": 0.08933257913148762, + "language_loss": 0.89889824, + "learning_rate": 0.000883511290633121, + "loss": 0.91014791, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.17651367, + "step": 1274, + "time_per_iteration": 2.5634043216705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111162, + "balance_loss_mlp": 1.09361923, + "epoch": 0.24528664871104272, + "flos": 550592391168.0, + "grad_norm": 0.08498045219099847, + "language_loss": 0.92045552, + "learning_rate": 0.000883311324390119, + "loss": 0.93157172, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.18005371, + "step": 1275, + "time_per_iteration": 2.688175678253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117687, + "balance_loss_mlp": 1.09850657, + "epoch": 0.24547903039630628, + "flos": 825903641088.0, + "grad_norm": 0.093400697768974, + "language_loss": 0.81587857, + "learning_rate": 0.0008831112093324629, + "loss": 0.82705545, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.19177246, + "step": 1276, + "time_per_iteration": 3.0782830715179443 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120052, + "balance_loss_mlp": 1.10156226, + "epoch": 0.24567141208156984, + "flos": 591598162944.0, + "grad_norm": 0.07571489376575821, + "language_loss": 0.88611054, + "learning_rate": 0.0008829109455378444, + "loss": 0.89731109, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.18481445, + "step": 1277, + "time_per_iteration": 2.7325568199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130651, + "balance_loss_mlp": 1.11251891, + "epoch": 0.2458637937668334, + "flos": 547874482176.0, + "grad_norm": 0.08746979241051268, + "language_loss": 0.86345637, + "learning_rate": 0.000882710533084013, + "loss": 0.87476289, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.18139648, + "step": 1278, + "time_per_iteration": 2.647641658782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113502, + "balance_loss_mlp": 1.11687636, + "epoch": 0.24605617545209696, + "flos": 515894635008.0, + "grad_norm": 0.0699906863373026, + "language_loss": 0.89239269, + "learning_rate": 0.0008825099720487755, + "loss": 0.90374291, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.18164062, + "step": 1279, + "time_per_iteration": 2.647472858428955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108592, + "balance_loss_mlp": 1.07490551, + "epoch": 0.24624855713736052, + "flos": 1511772553728.0, + "grad_norm": 0.04364177649541596, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76347059, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.11035156, + "step": 1280, + "time_per_iteration": 4.876530647277832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056171, + "balance_loss_mlp": 1.04515576, + "epoch": 0.24644093882262408, + "flos": 1527608305152.0, + "grad_norm": 0.029948837084711404, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 
0.79000282, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.11035156, + "step": 1281, + "time_per_iteration": 4.817251205444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130582, + "balance_loss_mlp": 1.11283183, + "epoch": 0.24663332050788764, + "flos": 659118228480.0, + "grad_norm": 0.0778912228408071, + "language_loss": 0.89449739, + "learning_rate": 0.0008819073982335619, + "loss": 0.9058032, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.17773438, + "step": 1282, + "time_per_iteration": 2.849764823913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139737, + "balance_loss_mlp": 1.12209415, + "epoch": 0.24682570219315123, + "flos": 541769098752.0, + "grad_norm": 0.06136900444292705, + "language_loss": 0.84456879, + "learning_rate": 0.0008817062436519235, + "loss": 0.85596615, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.17651367, + "step": 1283, + "time_per_iteration": 2.662811040878296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126818, + "balance_loss_mlp": 1.10860264, + "epoch": 0.24701808387841478, + "flos": 440695116288.0, + "grad_norm": 0.11946768082571088, + "language_loss": 0.895989, + "learning_rate": 0.0008815049408787788, + "loss": 0.90725714, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.18212891, + "step": 1284, + "time_per_iteration": 2.5498671531677246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118472, + "balance_loss_mlp": 1.10030437, + "epoch": 0.24721046556367834, + "flos": 468066926592.0, + "grad_norm": 0.07911934764568136, + "language_loss": 0.85533321, + "learning_rate": 0.0008813034899922805, + "loss": 0.86651796, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.1817627, + "step": 1285, + "time_per_iteration": 2.546613931655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112528, + "balance_loss_mlp": 1.10687399, + "epoch": 0.2474028472489419, + 
"flos": 504427585536.0, + "grad_norm": 0.09325179905503529, + "language_loss": 0.89224762, + "learning_rate": 0.0008811018910706387, + "loss": 0.90350044, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.18395996, + "step": 1286, + "time_per_iteration": 2.5715928077697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124504, + "balance_loss_mlp": 1.10582423, + "epoch": 0.24759522893420546, + "flos": 479956492800.0, + "grad_norm": 0.08651255320330896, + "language_loss": 0.81603038, + "learning_rate": 0.0008809001441921211, + "loss": 0.82727551, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.18688965, + "step": 1287, + "time_per_iteration": 2.76352858543396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116455, + "balance_loss_mlp": 1.09800124, + "epoch": 0.24778761061946902, + "flos": 533706776064.0, + "grad_norm": 0.07934964537800443, + "language_loss": 0.85291266, + "learning_rate": 0.0008806982494350528, + "loss": 0.86407721, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.18457031, + "step": 1288, + "time_per_iteration": 2.6464178562164307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125382, + "balance_loss_mlp": 1.10674942, + "epoch": 0.24797999230473258, + "flos": 559798553088.0, + "grad_norm": 0.07889330448691204, + "language_loss": 0.89930373, + "learning_rate": 0.0008804962068778161, + "loss": 0.91055757, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.18615723, + "step": 1289, + "time_per_iteration": 2.8725006580352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123355, + "balance_loss_mlp": 1.10481799, + "epoch": 0.24817237398999614, + "flos": 624225180672.0, + "grad_norm": 0.09114492679937135, + "language_loss": 0.80640042, + "learning_rate": 0.0008802940165988511, + "loss": 0.81763393, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.18530273, + "step": 1290, + "time_per_iteration": 
2.9053151607513428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113226, + "balance_loss_mlp": 1.11324596, + "epoch": 0.2483647556752597, + "flos": 612281286144.0, + "grad_norm": 0.07850606096458997, + "language_loss": 0.88298845, + "learning_rate": 0.000880091678676655, + "loss": 0.89431107, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.18981934, + "step": 1291, + "time_per_iteration": 2.8338379859924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115571, + "balance_loss_mlp": 1.09697485, + "epoch": 0.2485571373605233, + "flos": 583553092608.0, + "grad_norm": 0.0792961220184265, + "language_loss": 0.89043152, + "learning_rate": 0.0008798891931897821, + "loss": 0.90158725, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.18579102, + "step": 1292, + "time_per_iteration": 2.7769196033477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121528, + "balance_loss_mlp": 1.10277641, + "epoch": 0.24874951904578685, + "flos": 494749347840.0, + "grad_norm": 0.0746346978796093, + "language_loss": 0.84222198, + "learning_rate": 0.0008796865602168447, + "loss": 0.8534373, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.18737793, + "step": 1293, + "time_per_iteration": 2.560858964920044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115627, + "balance_loss_mlp": 1.09803176, + "epoch": 0.2489419007310504, + "flos": 456174789120.0, + "grad_norm": 0.06740604853273545, + "language_loss": 0.88270545, + "learning_rate": 0.0008794837798365115, + "loss": 0.89386165, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.17614746, + "step": 1294, + "time_per_iteration": 2.6477129459381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125631, + "balance_loss_mlp": 1.10763049, + "epoch": 0.24913428241631397, + "flos": 485471232000.0, + "grad_norm": 0.0873145111764115, + "language_loss": 0.88408256, + "learning_rate": 
0.0008792808521275089, + "loss": 0.89533883, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.18017578, + "step": 1295, + "time_per_iteration": 2.7224135398864746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121076, + "balance_loss_mlp": 1.10262191, + "epoch": 0.24932666410157753, + "flos": 518906580480.0, + "grad_norm": 0.0692696283298791, + "language_loss": 0.87340117, + "learning_rate": 0.0008790777771686206, + "loss": 0.88461185, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.18444824, + "step": 1296, + "time_per_iteration": 2.61126446723938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113013, + "balance_loss_mlp": 1.09509635, + "epoch": 0.2495190457868411, + "flos": 472603382784.0, + "grad_norm": 0.07573373752967896, + "language_loss": 0.84983516, + "learning_rate": 0.0008788745550386872, + "loss": 0.86096525, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.17932129, + "step": 1297, + "time_per_iteration": 2.573880672454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117051, + "balance_loss_mlp": 1.09876418, + "epoch": 0.24971142747210465, + "flos": 745886112768.0, + "grad_norm": 0.10171762649266601, + "language_loss": 0.797032, + "learning_rate": 0.0008786711858166063, + "loss": 0.80820251, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.18286133, + "step": 1298, + "time_per_iteration": 2.9712767601013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123606, + "balance_loss_mlp": 1.10497391, + "epoch": 0.2499038091573682, + "flos": 749557711872.0, + "grad_norm": 0.0822091876784568, + "language_loss": 0.83161783, + "learning_rate": 0.0008784676695813332, + "loss": 0.8428539, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.1862793, + "step": 1299, + "time_per_iteration": 2.966691017150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129878, + "balance_loss_mlp": 1.11144853, + 
"epoch": 0.2500961908426318, + "flos": 745060902912.0, + "grad_norm": 0.08080427389944742, + "language_loss": 0.84450245, + "learning_rate": 0.0008782640064118796, + "loss": 0.85580122, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.18408203, + "step": 1300, + "time_per_iteration": 2.92551589012146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240263, + "balance_loss_mlp": 1.22471797, + "epoch": 0.2502885725278953, + "flos": 1417424334336.0, + "grad_norm": 0.06645546985774646, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77425015, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.15527344, + "step": 1301, + "time_per_iteration": 4.9493842124938965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114228, + "balance_loss_mlp": 1.12376654, + "epoch": 0.2504809542131589, + "flos": 515215157760.0, + "grad_norm": 0.09006790660725612, + "language_loss": 0.8623417, + "learning_rate": 0.0008778562395867648, + "loss": 0.87376451, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.18518066, + "step": 1302, + "time_per_iteration": 2.635500907897949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122782, + "balance_loss_mlp": 1.10403061, + "epoch": 0.25067333589842244, + "flos": 525819921408.0, + "grad_norm": 0.07479626477523657, + "language_loss": 0.83630598, + "learning_rate": 0.0008776521360894127, + "loss": 0.84753382, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.1875, + "step": 1303, + "time_per_iteration": 2.640951156616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090617, + "balance_loss_mlp": 1.07707512, + "epoch": 0.25086571758368603, + "flos": 1473897295872.0, + "grad_norm": 0.0418328343897397, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.80052686, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.13574219, + "step": 1304, + 
"time_per_iteration": 4.842891454696655 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104198, + "balance_loss_mlp": 1.08618569, + "epoch": 0.2510580992689496, + "flos": 528382185984.0, + "grad_norm": 0.0798377990367126, + "language_loss": 0.90237606, + "learning_rate": 0.0008772434893213186, + "loss": 0.91341805, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.18017578, + "step": 1305, + "time_per_iteration": 2.6264374256134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097469, + "balance_loss_mlp": 1.07925391, + "epoch": 0.25125048095421315, + "flos": 517446309888.0, + "grad_norm": 0.07815304176143087, + "language_loss": 0.84344316, + "learning_rate": 0.0008770389462092276, + "loss": 0.85441786, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.18225098, + "step": 1306, + "time_per_iteration": 2.6599185466766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093714, + "balance_loss_mlp": 1.07480729, + "epoch": 0.25144286263947674, + "flos": 620462177280.0, + "grad_norm": 0.08248282915226902, + "language_loss": 0.86642498, + "learning_rate": 0.0008768342567176357, + "loss": 0.87736213, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.18908691, + "step": 1307, + "time_per_iteration": 2.919123411178589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094095, + "balance_loss_mlp": 1.07524765, + "epoch": 0.25163524432474027, + "flos": 503799865344.0, + "grad_norm": 0.07892434793160769, + "language_loss": 0.90316761, + "learning_rate": 0.0008766294209260107, + "loss": 0.91410857, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.18859863, + "step": 1308, + "time_per_iteration": 2.703994035720825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093703, + "balance_loss_mlp": 1.07496333, + "epoch": 0.25182762601000386, + "flos": 509072698368.0, + "grad_norm": 0.09325948106778781, + "language_loss": 0.9126637, + 
"learning_rate": 0.0008764244389138767, + "loss": 0.92360079, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.18725586, + "step": 1309, + "time_per_iteration": 2.6175687313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092261, + "balance_loss_mlp": 1.07365251, + "epoch": 0.2520200076952674, + "flos": 633896077824.0, + "grad_norm": 0.10626806402083949, + "language_loss": 0.81772095, + "learning_rate": 0.000876219310760815, + "loss": 0.82864356, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.18603516, + "step": 1310, + "time_per_iteration": 2.8659133911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097983, + "balance_loss_mlp": 1.07988715, + "epoch": 0.252212389380531, + "flos": 494638119936.0, + "grad_norm": 0.13076548306856256, + "language_loss": 0.81004, + "learning_rate": 0.0008760140365464631, + "loss": 0.82101983, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.18103027, + "step": 1311, + "time_per_iteration": 2.646810531616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120372, + "balance_loss_mlp": 1.10276532, + "epoch": 0.2524047710657945, + "flos": 490544004096.0, + "grad_norm": 0.11580551837549759, + "language_loss": 0.87203217, + "learning_rate": 0.0008758086163505156, + "loss": 0.88323587, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.17626953, + "step": 1312, + "time_per_iteration": 2.601256847381592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135664, + "balance_loss_mlp": 1.11779475, + "epoch": 0.2525971527510581, + "flos": 647431294464.0, + "grad_norm": 0.0666103465279768, + "language_loss": 0.89063561, + "learning_rate": 0.0008756030502527239, + "loss": 0.90199232, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.17883301, + "step": 1313, + "time_per_iteration": 2.8330187797546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161369, + "balance_loss_mlp": 
1.14360678, + "epoch": 0.2527895344363217, + "flos": 569266818048.0, + "grad_norm": 0.0708022330446315, + "language_loss": 0.90153992, + "learning_rate": 0.0008753973383328954, + "loss": 0.91315365, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.17785645, + "step": 1314, + "time_per_iteration": 2.685375928878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011545, + "balance_loss_mlp": 1.13647509, + "epoch": 0.2529819161215852, + "flos": 514048923648.0, + "grad_norm": 0.08974334028560671, + "language_loss": 0.83722651, + "learning_rate": 0.0008751914806708952, + "loss": 0.84877157, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.18029785, + "step": 1315, + "time_per_iteration": 2.6155343055725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164677, + "balance_loss_mlp": 1.14708161, + "epoch": 0.2531742978068488, + "flos": 531253168128.0, + "grad_norm": 0.08978858583773926, + "language_loss": 0.81837153, + "learning_rate": 0.0008749854773466439, + "loss": 0.83001828, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.17614746, + "step": 1316, + "time_per_iteration": 2.7219769954681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163056, + "balance_loss_mlp": 1.14553261, + "epoch": 0.25336667949211233, + "flos": 596638628352.0, + "grad_norm": 0.07528804981442601, + "language_loss": 0.8451466, + "learning_rate": 0.0008747793284401192, + "loss": 0.85677719, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.17541504, + "step": 1317, + "time_per_iteration": 2.7144973278045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151322, + "balance_loss_mlp": 1.13359582, + "epoch": 0.2535590611773759, + "flos": 602061963264.0, + "grad_norm": 0.08898497659473818, + "language_loss": 0.85280555, + "learning_rate": 0.0008745730340313551, + "loss": 0.86431873, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.17736816, + "step": 
1318, + "time_per_iteration": 2.7930002212524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.13595057, + "epoch": 0.25375144286263945, + "flos": 495327508992.0, + "grad_norm": 0.08370435102239727, + "language_loss": 0.84217906, + "learning_rate": 0.0008743665942004422, + "loss": 0.85371482, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.1763916, + "step": 1319, + "time_per_iteration": 2.68245530128479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160638, + "balance_loss_mlp": 1.14311421, + "epoch": 0.25394382454790304, + "flos": 512470084608.0, + "grad_norm": 0.07392804364708638, + "language_loss": 0.92852235, + "learning_rate": 0.0008741600090275277, + "loss": 0.9401288, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.17529297, + "step": 1320, + "time_per_iteration": 2.5977306365966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163682, + "balance_loss_mlp": 1.14569294, + "epoch": 0.25413620623316663, + "flos": 959038589952.0, + "grad_norm": 0.10450079995548846, + "language_loss": 0.8392204, + "learning_rate": 0.0008739532785928151, + "loss": 0.8508572, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.17993164, + "step": 1321, + "time_per_iteration": 3.464723587036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181344, + "balance_loss_mlp": 1.16827822, + "epoch": 0.25432858791843016, + "flos": 1577283922944.0, + "grad_norm": 0.05258117628035473, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.76074928, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.13085938, + "step": 1322, + "time_per_iteration": 4.845709562301636 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194039, + "balance_loss_mlp": 1.17626476, + "epoch": 0.25452096960369375, + "flos": 583802712576.0, + "grad_norm": 0.080849834949414, + "language_loss": 0.83025825, + 
"learning_rate": 0.0008735393822590908, + "loss": 0.84219867, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.17785645, + "step": 1323, + "time_per_iteration": 2.7540626525878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204948, + "balance_loss_mlp": 1.18740082, + "epoch": 0.2547133512889573, + "flos": 508603193856.0, + "grad_norm": 0.08178952973842966, + "language_loss": 0.86670357, + "learning_rate": 0.0008733322165207681, + "loss": 0.87875307, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.17578125, + "step": 1324, + "time_per_iteration": 2.6596570014953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203339, + "balance_loss_mlp": 1.18555284, + "epoch": 0.25490573297422087, + "flos": 782619729408.0, + "grad_norm": 0.08051520692213045, + "language_loss": 0.82727516, + "learning_rate": 0.0008731249058420247, + "loss": 0.8393085, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.17810059, + "step": 1325, + "time_per_iteration": 3.082704782485962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197065, + "balance_loss_mlp": 1.17887366, + "epoch": 0.2550981146594844, + "flos": 509878084608.0, + "grad_norm": 0.07988786822753648, + "language_loss": 0.90256196, + "learning_rate": 0.0008729174503033459, + "loss": 0.9145326, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.18188477, + "step": 1326, + "time_per_iteration": 2.663212299346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163002, + "balance_loss_mlp": 1.14462042, + "epoch": 0.255290496344748, + "flos": 676673409024.0, + "grad_norm": 0.09140325585124401, + "language_loss": 0.82217562, + "learning_rate": 0.0008727098499852728, + "loss": 0.83380556, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.18383789, + "step": 1327, + "time_per_iteration": 2.859302520751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114294, + 
"balance_loss_mlp": 1.12451005, + "epoch": 0.2554828780300115, + "flos": 537815572992.0, + "grad_norm": 0.07316654776483361, + "language_loss": 0.89623642, + "learning_rate": 0.0008725021049684034, + "loss": 0.90766573, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.18432617, + "step": 1328, + "time_per_iteration": 2.7523410320281982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117051, + "balance_loss_mlp": 1.09832358, + "epoch": 0.2556752597152751, + "flos": 824186409984.0, + "grad_norm": 0.06969820691150284, + "language_loss": 0.82930326, + "learning_rate": 0.000872294215333391, + "loss": 0.84047389, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.18713379, + "step": 1329, + "time_per_iteration": 3.243213415145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108589, + "balance_loss_mlp": 1.08953917, + "epoch": 0.2558676414005387, + "flos": 570791328768.0, + "grad_norm": 0.08533282388950945, + "language_loss": 0.82889348, + "learning_rate": 0.0008720861811609457, + "loss": 0.83997935, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.19042969, + "step": 1330, + "time_per_iteration": 2.789504051208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086894, + "balance_loss_mlp": 1.06807089, + "epoch": 0.2560600230858022, + "flos": 486684453888.0, + "grad_norm": 0.08137535215054885, + "language_loss": 0.83645493, + "learning_rate": 0.0008718780025318338, + "loss": 0.84732389, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.18823242, + "step": 1331, + "time_per_iteration": 2.7668251991271973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092113, + "balance_loss_mlp": 1.07411242, + "epoch": 0.2562524047710658, + "flos": 513122397696.0, + "grad_norm": 0.08447566633159821, + "language_loss": 0.83860987, + "learning_rate": 0.0008716696795268771, + "loss": 0.84953099, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 
0.18017578, + "step": 1332, + "time_per_iteration": 2.71281099319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088022, + "balance_loss_mlp": 1.06994987, + "epoch": 0.25644478645632934, + "flos": 634820032512.0, + "grad_norm": 0.08355917909814405, + "language_loss": 0.85442013, + "learning_rate": 0.0008714612122269538, + "loss": 0.8653003, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.1809082, + "step": 1333, + "time_per_iteration": 2.9077794551849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108256, + "balance_loss_mlp": 1.09015965, + "epoch": 0.25663716814159293, + "flos": 436591088640.0, + "grad_norm": 0.09490231540823739, + "language_loss": 0.89133245, + "learning_rate": 0.0008712526007129982, + "loss": 0.90241498, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.18103027, + "step": 1334, + "time_per_iteration": 2.5269079208374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127264, + "balance_loss_mlp": 1.10958493, + "epoch": 0.25682954982685646, + "flos": 498161415168.0, + "grad_norm": 0.09530184614586146, + "language_loss": 0.90164447, + "learning_rate": 0.0008710438450660003, + "loss": 0.91291702, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.17687988, + "step": 1335, + "time_per_iteration": 2.690424680709839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127744, + "balance_loss_mlp": 1.10994577, + "epoch": 0.25702193151212005, + "flos": 457701871104.0, + "grad_norm": 0.09938976745138839, + "language_loss": 0.87409496, + "learning_rate": 0.0008708349453670064, + "loss": 0.88537246, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.17810059, + "step": 1336, + "time_per_iteration": 2.5319509506225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128077, + "balance_loss_mlp": 1.10982585, + "epoch": 0.2572143131973836, + "flos": 598281707520.0, + "grad_norm": 0.08461134195014028, + 
"language_loss": 0.91159999, + "learning_rate": 0.0008706259016971185, + "loss": 0.92288077, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.18249512, + "step": 1337, + "time_per_iteration": 2.8355276584625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133843, + "balance_loss_mlp": 1.11533022, + "epoch": 0.25740669488264717, + "flos": 698308024320.0, + "grad_norm": 0.1004001057114973, + "language_loss": 0.82634485, + "learning_rate": 0.0008704167141374944, + "loss": 0.83768326, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.18518066, + "step": 1338, + "time_per_iteration": 2.83562970161438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125326, + "balance_loss_mlp": 1.10650253, + "epoch": 0.25759907656791076, + "flos": 502379241984.0, + "grad_norm": 0.07535387519287148, + "language_loss": 0.87972409, + "learning_rate": 0.0008702073827693482, + "loss": 0.89097726, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.18823242, + "step": 1339, + "time_per_iteration": 2.7440268993377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121252, + "balance_loss_mlp": 1.10240531, + "epoch": 0.2577914582531743, + "flos": 773880500736.0, + "grad_norm": 0.07907705856450171, + "language_loss": 0.8856355, + "learning_rate": 0.0008699979076739494, + "loss": 0.89684802, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.18847656, + "step": 1340, + "time_per_iteration": 2.985356092453003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132949, + "balance_loss_mlp": 1.11369705, + "epoch": 0.2579838399384379, + "flos": 459666150912.0, + "grad_norm": 0.10358510275764175, + "language_loss": 0.88529009, + "learning_rate": 0.0008697882889326234, + "loss": 0.89661956, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.19238281, + "step": 1341, + "time_per_iteration": 2.564622163772583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01136453, + "balance_loss_mlp": 1.11695075, + "epoch": 0.2581762216237014, + "flos": 569185325568.0, + "grad_norm": 0.09783747399550236, + "language_loss": 0.8651613, + "learning_rate": 0.0008695785266267515, + "loss": 0.87652576, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.19482422, + "step": 1342, + "time_per_iteration": 2.7061781883239746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147948, + "balance_loss_mlp": 1.12840939, + "epoch": 0.258368603308965, + "flos": 604201711104.0, + "grad_norm": 0.08416519118542358, + "language_loss": 0.83111393, + "learning_rate": 0.0008693686208377704, + "loss": 0.84259331, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.19543457, + "step": 1343, + "time_per_iteration": 2.8751444816589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150711, + "balance_loss_mlp": 1.13156581, + "epoch": 0.2585609849942285, + "flos": 491460618240.0, + "grad_norm": 0.07899493252865974, + "language_loss": 0.88980556, + "learning_rate": 0.0008691585716471733, + "loss": 0.90131271, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.19140625, + "step": 1344, + "time_per_iteration": 2.6969785690307617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159409, + "balance_loss_mlp": 1.14027607, + "epoch": 0.2587533666794921, + "flos": 640755090432.0, + "grad_norm": 0.06941419908962602, + "language_loss": 0.8544178, + "learning_rate": 0.0008689483791365079, + "loss": 0.86601192, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.19116211, + "step": 1345, + "time_per_iteration": 2.8562369346618652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154974, + "balance_loss_mlp": 1.13669968, + "epoch": 0.2589457483647557, + "flos": 576849724416.0, + "grad_norm": 0.07286553563097259, + "language_loss": 0.89186096, + "learning_rate": 0.0008687380433873786, + "loss": 0.90341073, + "num_input_tokens_seen": 111581936, + 
"router_z_loss_mlp": 0.18273926, + "step": 1346, + "time_per_iteration": 2.7854301929473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173599, + "balance_loss_mlp": 1.15573001, + "epoch": 0.25913813005001923, + "flos": 535424007168.0, + "grad_norm": 0.11357363401175323, + "language_loss": 0.82125735, + "learning_rate": 0.0008685275644814448, + "loss": 0.83299333, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.17883301, + "step": 1347, + "time_per_iteration": 2.6921608448028564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116629, + "balance_loss_mlp": 1.14855206, + "epoch": 0.2593305117352828, + "flos": 721039491072.0, + "grad_norm": 0.07639398633752482, + "language_loss": 0.8419714, + "learning_rate": 0.0008683169425004216, + "loss": 0.85363436, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.17773438, + "step": 1348, + "time_per_iteration": 2.9085500240325928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153795, + "balance_loss_mlp": 1.13597322, + "epoch": 0.25952289342054635, + "flos": 710096274432.0, + "grad_norm": 0.09519621553180321, + "language_loss": 0.8328886, + "learning_rate": 0.0008681061775260799, + "loss": 0.84442651, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.17834473, + "step": 1349, + "time_per_iteration": 2.8755290508270264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143034, + "balance_loss_mlp": 1.12578487, + "epoch": 0.25971527510580994, + "flos": 455920399872.0, + "grad_norm": 0.10298645875309809, + "language_loss": 0.92206728, + "learning_rate": 0.0008678952696402458, + "loss": 0.93349767, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.17260742, + "step": 1350, + "time_per_iteration": 2.530040740966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128339, + "balance_loss_mlp": 1.11113763, + "epoch": 0.25990765679107347, + "flos": 612528334848.0, + "grad_norm": 
0.07054972097096389, + "language_loss": 0.85973078, + "learning_rate": 0.000867684218924801, + "loss": 0.87101424, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.17211914, + "step": 1351, + "time_per_iteration": 2.924776077270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135752, + "balance_loss_mlp": 1.12478447, + "epoch": 0.26010003847633706, + "flos": 1537963075584.0, + "grad_norm": 0.07057525744027235, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80082846, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.10986328, + "step": 1352, + "time_per_iteration": 4.937533378601074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127686, + "balance_loss_mlp": 1.11084199, + "epoch": 0.2602924201616006, + "flos": 716265897984.0, + "grad_norm": 0.06384913215279323, + "language_loss": 0.85261834, + "learning_rate": 0.0008672616893328834, + "loss": 0.86389524, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.1685791, + "step": 1353, + "time_per_iteration": 2.9442062377929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122527, + "balance_loss_mlp": 1.10589719, + "epoch": 0.2604848018468642, + "flos": 643529899008.0, + "grad_norm": 0.09199225792086613, + "language_loss": 0.90041292, + "learning_rate": 0.0008670502106204512, + "loss": 0.91163814, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.16638184, + "step": 1354, + "time_per_iteration": 2.840792417526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132378, + "balance_loss_mlp": 1.11488962, + "epoch": 0.26067718353212777, + "flos": 517033704960.0, + "grad_norm": 0.0749682309300763, + "language_loss": 0.81919277, + "learning_rate": 0.0008668385894064892, + "loss": 0.83051658, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.1751709, + "step": 1355, + "time_per_iteration": 2.649226665496826 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01150444, + "balance_loss_mlp": 1.13379025, + "epoch": 0.2608695652173913, + "flos": 822733479936.0, + "grad_norm": 0.10108237113866697, + "language_loss": 0.89089942, + "learning_rate": 0.0008666268257731562, + "loss": 0.90240383, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.16662598, + "step": 1356, + "time_per_iteration": 3.1606926918029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152316, + "balance_loss_mlp": 1.13520908, + "epoch": 0.2610619469026549, + "flos": 1007850097152.0, + "grad_norm": 0.09285423546908722, + "language_loss": 0.85545158, + "learning_rate": 0.0008664149198026662, + "loss": 0.86697471, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.17126465, + "step": 1357, + "time_per_iteration": 3.286130428314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164462, + "balance_loss_mlp": 1.14699829, + "epoch": 0.2612543285879184, + "flos": 536782961664.0, + "grad_norm": 0.08517439685870379, + "language_loss": 0.88857412, + "learning_rate": 0.0008662028715772883, + "loss": 0.90021884, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.17480469, + "step": 1358, + "time_per_iteration": 2.6877803802490234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157352, + "balance_loss_mlp": 1.13951862, + "epoch": 0.261446710273182, + "flos": 519420501504.0, + "grad_norm": 0.08437519054308197, + "language_loss": 0.85356647, + "learning_rate": 0.0008659906811793467, + "loss": 0.86514002, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.1784668, + "step": 1359, + "time_per_iteration": 2.701963186264038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152325, + "balance_loss_mlp": 1.13483691, + "epoch": 0.26163909195844554, + "flos": 583259056128.0, + "grad_norm": 0.09516463994255123, + "language_loss": 0.89262813, + "learning_rate": 0.0008657783486912215, + "loss": 0.90415138, + "num_input_tokens_seen": 
112795088, + "router_z_loss_mlp": 0.17504883, + "step": 1360, + "time_per_iteration": 2.7410097122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150671, + "balance_loss_mlp": 1.1330992, + "epoch": 0.2618314736437091, + "flos": 958762179072.0, + "grad_norm": 0.06828467212359378, + "language_loss": 0.8976928, + "learning_rate": 0.0008655658741953472, + "loss": 0.90919948, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.17590332, + "step": 1361, + "time_per_iteration": 3.2329330444335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138416, + "balance_loss_mlp": 1.12074876, + "epoch": 0.26202385532897265, + "flos": 574803952128.0, + "grad_norm": 0.06454511059104741, + "language_loss": 0.88249099, + "learning_rate": 0.0008653532577742136, + "loss": 0.89387512, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.17675781, + "step": 1362, + "time_per_iteration": 2.746363401412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139921, + "balance_loss_mlp": 1.12302947, + "epoch": 0.26221623701423624, + "flos": 445471280640.0, + "grad_norm": 0.07711827630070714, + "language_loss": 0.86794758, + "learning_rate": 0.0008651404995103659, + "loss": 0.87934673, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.16906738, + "step": 1363, + "time_per_iteration": 2.5565500259399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132814, + "balance_loss_mlp": 1.11538577, + "epoch": 0.26240861869949983, + "flos": 535718043648.0, + "grad_norm": 0.08155880386034024, + "language_loss": 0.8709327, + "learning_rate": 0.0008649275994864041, + "loss": 0.8822608, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.17431641, + "step": 1364, + "time_per_iteration": 2.716562032699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133153, + "balance_loss_mlp": 1.11586761, + "epoch": 0.26260100038476336, + "flos": 565249052160.0, + "grad_norm": 
0.06672959076804742, + "language_loss": 0.83875144, + "learning_rate": 0.0008647145577849834, + "loss": 0.85008299, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.1730957, + "step": 1365, + "time_per_iteration": 2.8476812839508057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129924, + "balance_loss_mlp": 1.11255515, + "epoch": 0.26279338207002695, + "flos": 613059508224.0, + "grad_norm": 0.0668808093236692, + "language_loss": 0.82936931, + "learning_rate": 0.0008645013744888139, + "loss": 0.8406685, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.17382812, + "step": 1366, + "time_per_iteration": 2.891817092895508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127692, + "balance_loss_mlp": 1.11063313, + "epoch": 0.2629857637552905, + "flos": 522832568832.0, + "grad_norm": 0.08778385712395331, + "language_loss": 0.87274009, + "learning_rate": 0.0008642880496806607, + "loss": 0.88401705, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.17077637, + "step": 1367, + "time_per_iteration": 2.8053958415985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120237, + "balance_loss_mlp": 1.10274851, + "epoch": 0.26317814544055407, + "flos": 534549238272.0, + "grad_norm": 0.07681621031760291, + "language_loss": 0.84336966, + "learning_rate": 0.0008640745834433437, + "loss": 0.85457206, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.17504883, + "step": 1368, + "time_per_iteration": 2.787339925765991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121438, + "balance_loss_mlp": 1.10430789, + "epoch": 0.2633705271258176, + "flos": 555543650304.0, + "grad_norm": 0.09521927305918056, + "language_loss": 0.86539549, + "learning_rate": 0.000863860975859738, + "loss": 0.87660992, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.17126465, + "step": 1369, + "time_per_iteration": 2.9646191596984863 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01114699, + "balance_loss_mlp": 1.0977838, + "epoch": 0.2635629088110812, + "flos": 552401026560.0, + "grad_norm": 0.08138719928792186, + "language_loss": 0.87995172, + "learning_rate": 0.0008636472270127733, + "loss": 0.89109874, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.16918945, + "step": 1370, + "time_per_iteration": 2.646869421005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110661, + "balance_loss_mlp": 1.08878803, + "epoch": 0.2637552904963448, + "flos": 455984640000.0, + "grad_norm": 0.09119402348134849, + "language_loss": 0.90394557, + "learning_rate": 0.0008634333369854345, + "loss": 0.91501164, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.1784668, + "step": 1371, + "time_per_iteration": 2.630207061767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101355, + "balance_loss_mlp": 1.083915, + "epoch": 0.2639476721816083, + "flos": 613128890880.0, + "grad_norm": 0.08212786438033774, + "language_loss": 0.87634504, + "learning_rate": 0.0008632193058607608, + "loss": 0.88735861, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.17456055, + "step": 1372, + "time_per_iteration": 2.7757019996643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114382, + "balance_loss_mlp": 1.09665525, + "epoch": 0.2641400538668719, + "flos": 571920486912.0, + "grad_norm": 0.10317877520485044, + "language_loss": 0.80747414, + "learning_rate": 0.0008630051337218466, + "loss": 0.81861794, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.17736816, + "step": 1373, + "time_per_iteration": 2.7459805011749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117051, + "balance_loss_mlp": 1.09961104, + "epoch": 0.2643324355521354, + "flos": 582251037696.0, + "grad_norm": 0.08099527295858751, + "language_loss": 0.82020557, + "learning_rate": 0.0008627908206518409, + "loss": 0.83137608, + "num_input_tokens_seen": 
113836320, + "router_z_loss_mlp": 0.17456055, + "step": 1374, + "time_per_iteration": 2.719428300857544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113813, + "balance_loss_mlp": 1.12554145, + "epoch": 0.264524817237399, + "flos": 1544678926848.0, + "grad_norm": 0.042063102349752246, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76289386, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.12597656, + "step": 1375, + "time_per_iteration": 4.988332748413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112187, + "balance_loss_mlp": 1.09442437, + "epoch": 0.26471719892266254, + "flos": 518034382848.0, + "grad_norm": 0.06812086657274741, + "language_loss": 0.91138768, + "learning_rate": 0.0008623617720514241, + "loss": 0.92250949, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.1776123, + "step": 1376, + "time_per_iteration": 2.644531726837158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109794, + "balance_loss_mlp": 1.09182918, + "epoch": 0.26490958060792613, + "flos": 517189349376.0, + "grad_norm": 0.0722091181333716, + "language_loss": 0.84490621, + "learning_rate": 0.0008621470366875848, + "loss": 0.85600418, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.1796875, + "step": 1377, + "time_per_iteration": 2.605417490005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100816, + "balance_loss_mlp": 1.08375728, + "epoch": 0.26510196229318966, + "flos": 596574388224.0, + "grad_norm": 0.07263229866332392, + "language_loss": 0.87396085, + "learning_rate": 0.0008619321607257966, + "loss": 0.884969, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.17077637, + "step": 1378, + "time_per_iteration": 2.7229108810424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100855, + "balance_loss_mlp": 1.08392727, + "epoch": 0.26529434397845325, + "flos": 685800649728.0, + "grad_norm": 
0.07341413806820511, + "language_loss": 0.82002622, + "learning_rate": 0.000861717144249482, + "loss": 0.83103478, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.16943359, + "step": 1379, + "time_per_iteration": 2.9031612873077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105487, + "balance_loss_mlp": 1.08884549, + "epoch": 0.26548672566371684, + "flos": 424353157632.0, + "grad_norm": 0.06987190342408907, + "language_loss": 0.89693463, + "learning_rate": 0.0008615019873421175, + "loss": 0.9079895, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.16650391, + "step": 1380, + "time_per_iteration": 2.5554280281066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105329, + "balance_loss_mlp": 1.08804345, + "epoch": 0.26567910734898037, + "flos": 489864526848.0, + "grad_norm": 0.07960659576711203, + "language_loss": 0.85129094, + "learning_rate": 0.0008612866900872349, + "loss": 0.86234426, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.17297363, + "step": 1381, + "time_per_iteration": 2.560756206512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115387, + "balance_loss_mlp": 1.0986619, + "epoch": 0.26587148903424396, + "flos": 534203444736.0, + "grad_norm": 0.10185032090542295, + "language_loss": 0.87836969, + "learning_rate": 0.0008610712525684197, + "loss": 0.88952351, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.1673584, + "step": 1382, + "time_per_iteration": 2.649127721786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111173, + "balance_loss_mlp": 1.09392381, + "epoch": 0.2660638707195075, + "flos": 1017464094720.0, + "grad_norm": 0.09094270381931494, + "language_loss": 0.84048492, + "learning_rate": 0.0008608556748693121, + "loss": 0.85159665, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.17260742, + "step": 1383, + "time_per_iteration": 3.2573940753936768 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01109064, + "balance_loss_mlp": 1.09163558, + "epoch": 0.2662562524047711, + "flos": 523981550592.0, + "grad_norm": 0.0818167871774861, + "language_loss": 0.859007, + "learning_rate": 0.000860639957073607, + "loss": 0.87009764, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.17443848, + "step": 1384, + "time_per_iteration": 2.7120518684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110901, + "balance_loss_mlp": 1.0937109, + "epoch": 0.2664486340900346, + "flos": 552381202944.0, + "grad_norm": 0.07681443511092155, + "language_loss": 0.87386912, + "learning_rate": 0.0008604240992650534, + "loss": 0.88497818, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.17211914, + "step": 1385, + "time_per_iteration": 2.69921612739563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113027, + "balance_loss_mlp": 1.09546757, + "epoch": 0.2666410157752982, + "flos": 470157115392.0, + "grad_norm": 0.06494344058238215, + "language_loss": 0.88934892, + "learning_rate": 0.0008602081015274545, + "loss": 0.9004792, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.17553711, + "step": 1386, + "time_per_iteration": 2.7353157997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117717, + "balance_loss_mlp": 1.10092068, + "epoch": 0.2668333974605617, + "flos": 569919131136.0, + "grad_norm": 0.06900257884101904, + "language_loss": 0.83328801, + "learning_rate": 0.0008599919639446684, + "loss": 0.8444652, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.16809082, + "step": 1387, + "time_per_iteration": 2.6927597522735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110335, + "balance_loss_mlp": 1.09289455, + "epoch": 0.2670257791458253, + "flos": 398982703104.0, + "grad_norm": 0.08338734757979376, + "language_loss": 0.79947424, + "learning_rate": 0.000859775686600607, + "loss": 0.81057751, + "num_input_tokens_seen": 
114984272, + "router_z_loss_mlp": 0.17468262, + "step": 1388, + "time_per_iteration": 2.5740597248077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123987, + "balance_loss_mlp": 1.10719037, + "epoch": 0.2672181608310889, + "flos": 515847647232.0, + "grad_norm": 0.09984082638450108, + "language_loss": 0.84917498, + "learning_rate": 0.0008595592695792367, + "loss": 0.86041486, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.16809082, + "step": 1389, + "time_per_iteration": 2.6907854080200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112253, + "balance_loss_mlp": 1.10618591, + "epoch": 0.26741054251635243, + "flos": 507521023488.0, + "grad_norm": 0.06989103866242331, + "language_loss": 0.90147883, + "learning_rate": 0.0008593427129645778, + "loss": 0.91270411, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.16345215, + "step": 1390, + "time_per_iteration": 2.6145434379577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120602, + "balance_loss_mlp": 1.10381722, + "epoch": 0.267602924201616, + "flos": 576647092224.0, + "grad_norm": 0.07905482313842922, + "language_loss": 0.85086334, + "learning_rate": 0.0008591260168407052, + "loss": 0.86206937, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.16796875, + "step": 1391, + "time_per_iteration": 2.787076711654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117002, + "balance_loss_mlp": 1.10062313, + "epoch": 0.26779530588687955, + "flos": 523984121856.0, + "grad_norm": 0.0789554563697551, + "language_loss": 0.8226018, + "learning_rate": 0.0008589091812917479, + "loss": 0.83377182, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.16381836, + "step": 1392, + "time_per_iteration": 2.6753129959106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122652, + "balance_loss_mlp": 1.10604584, + "epoch": 0.26798768757214314, + "flos": 556771926528.0, + "grad_norm": 
0.07614476371572584, + "language_loss": 0.84920317, + "learning_rate": 0.0008586922064018887, + "loss": 0.86042964, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.1661377, + "step": 1393, + "time_per_iteration": 2.716813325881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114509, + "balance_loss_mlp": 1.09750938, + "epoch": 0.2681800692574067, + "flos": 930614717952.0, + "grad_norm": 0.08000570031295028, + "language_loss": 0.89098954, + "learning_rate": 0.0008584750922553651, + "loss": 0.90213466, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.17016602, + "step": 1394, + "time_per_iteration": 3.1575980186462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121389, + "balance_loss_mlp": 1.10477114, + "epoch": 0.26837245094267026, + "flos": 701080261632.0, + "grad_norm": 0.0683134764251081, + "language_loss": 0.83357704, + "learning_rate": 0.0008582578389364677, + "loss": 0.84479094, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.16625977, + "step": 1395, + "time_per_iteration": 2.885806083679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127129, + "balance_loss_mlp": 1.10989153, + "epoch": 0.26856483262793385, + "flos": 593191683072.0, + "grad_norm": 0.08737379963197432, + "language_loss": 0.91578317, + "learning_rate": 0.0008580404465295422, + "loss": 0.92705452, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.17260742, + "step": 1396, + "time_per_iteration": 2.849519968032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135341, + "balance_loss_mlp": 1.1180197, + "epoch": 0.2687572143131974, + "flos": 714271882752.0, + "grad_norm": 0.08461023567525901, + "language_loss": 0.8857668, + "learning_rate": 0.0008578229151189876, + "loss": 0.89712024, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.17321777, + "step": 1397, + "time_per_iteration": 2.94858980178833 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01127453, + "balance_loss_mlp": 1.10984576, + "epoch": 0.26894959599846097, + "flos": 467718561792.0, + "grad_norm": 0.12493178829468786, + "language_loss": 0.81211323, + "learning_rate": 0.0008576052447892573, + "loss": 0.82338774, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.17614746, + "step": 1398, + "time_per_iteration": 2.534120798110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135254, + "balance_loss_mlp": 1.1178261, + "epoch": 0.2691419776837245, + "flos": 468701987328.0, + "grad_norm": 0.06803844431236612, + "language_loss": 0.85910499, + "learning_rate": 0.000857387435624858, + "loss": 0.87045753, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.17456055, + "step": 1399, + "time_per_iteration": 2.554008960723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159537, + "balance_loss_mlp": 1.1418941, + "epoch": 0.2693343593689881, + "flos": 937651396608.0, + "grad_norm": 0.0815296826798993, + "language_loss": 0.87922233, + "learning_rate": 0.0008571694877103513, + "loss": 0.8908177, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.17663574, + "step": 1400, + "time_per_iteration": 3.2941367626190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173169, + "balance_loss_mlp": 1.15442979, + "epoch": 0.2695267410542516, + "flos": 577600782336.0, + "grad_norm": 0.09384983289618287, + "language_loss": 0.8761692, + "learning_rate": 0.0008569514011303515, + "loss": 0.88790089, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.1875, + "step": 1401, + "time_per_iteration": 2.814588785171509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157764, + "balance_loss_mlp": 1.1397872, + "epoch": 0.2697191227395152, + "flos": 556823683584.0, + "grad_norm": 0.09439986590001768, + "language_loss": 0.87801731, + "learning_rate": 0.0008567331759695277, + "loss": 0.88959491, + "num_input_tokens_seen": 116031728, + 
"router_z_loss_mlp": 0.17980957, + "step": 1402, + "time_per_iteration": 2.765251398086548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144715, + "balance_loss_mlp": 1.12577283, + "epoch": 0.26991150442477874, + "flos": 529281547776.0, + "grad_norm": 0.08321050634823257, + "language_loss": 0.85899508, + "learning_rate": 0.0008565148123126023, + "loss": 0.87044227, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.18933105, + "step": 1403, + "time_per_iteration": 2.7100989818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125241, + "balance_loss_mlp": 1.10733557, + "epoch": 0.2701038861100423, + "flos": 532006797312.0, + "grad_norm": 0.0728098596241797, + "language_loss": 0.86166966, + "learning_rate": 0.0008562963102443516, + "loss": 0.87292206, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.17907715, + "step": 1404, + "time_per_iteration": 2.7286291122436523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112919, + "balance_loss_mlp": 1.09493017, + "epoch": 0.2702962677953059, + "flos": 735227020800.0, + "grad_norm": 0.10158619193030523, + "language_loss": 0.84717911, + "learning_rate": 0.0008560776698496056, + "loss": 0.85830832, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.17993164, + "step": 1405, + "time_per_iteration": 2.9067912101745605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103079, + "balance_loss_mlp": 1.08472061, + "epoch": 0.27048864948056944, + "flos": 574761733632.0, + "grad_norm": 0.08020634125989436, + "language_loss": 0.85596079, + "learning_rate": 0.0008558588912132481, + "loss": 0.86699152, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.18359375, + "step": 1406, + "time_per_iteration": 2.880148410797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071193, + "balance_loss_mlp": 1.05955815, + "epoch": 0.27068103116583303, + "flos": 1423853489664.0, + "grad_norm": 
0.03626473669965315, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77530181, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.11621094, + "step": 1407, + "time_per_iteration": 4.905766487121582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087497, + "balance_loss_mlp": 1.06903148, + "epoch": 0.27087341285109656, + "flos": 531999456768.0, + "grad_norm": 0.0815781254437323, + "language_loss": 0.82643741, + "learning_rate": 0.0008554209195555016, + "loss": 0.83731234, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.18481445, + "step": 1408, + "time_per_iteration": 2.759427309036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086012, + "balance_loss_mlp": 1.06754613, + "epoch": 0.27106579453636015, + "flos": 581378840064.0, + "grad_norm": 0.08207637293966, + "language_loss": 0.87980115, + "learning_rate": 0.0008552017267041483, + "loss": 0.89066136, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.18457031, + "step": 1409, + "time_per_iteration": 2.71040678024292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088281, + "balance_loss_mlp": 1.06865954, + "epoch": 0.2712581762216237, + "flos": 506801899008.0, + "grad_norm": 0.0734300404961751, + "language_loss": 0.83141303, + "learning_rate": 0.0008549823959512549, + "loss": 0.84229583, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.19616699, + "step": 1410, + "time_per_iteration": 2.6883578300476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104836, + "balance_loss_mlp": 1.08663297, + "epoch": 0.27145055790688727, + "flos": 997442823168.0, + "grad_norm": 0.07342840956593329, + "language_loss": 0.86307788, + "learning_rate": 0.0008547629273819728, + "loss": 0.87412632, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.18212891, + "step": 1411, + "time_per_iteration": 3.4179537296295166 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01110447, + "balance_loss_mlp": 1.09208882, + "epoch": 0.2716429395921508, + "flos": 546681083904.0, + "grad_norm": 0.07902892919535931, + "language_loss": 0.83264589, + "learning_rate": 0.0008545433210815074, + "loss": 0.84375036, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.18347168, + "step": 1412, + "time_per_iteration": 2.644336462020874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132524, + "balance_loss_mlp": 1.11396301, + "epoch": 0.2718353212774144, + "flos": 573225113088.0, + "grad_norm": 0.08239543530107682, + "language_loss": 0.87688351, + "learning_rate": 0.0008543235771351176, + "loss": 0.88820869, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.18554688, + "step": 1413, + "time_per_iteration": 2.7242777347564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140498, + "balance_loss_mlp": 1.12286687, + "epoch": 0.272027702962678, + "flos": 644305549824.0, + "grad_norm": 0.06292390757949942, + "language_loss": 0.84580851, + "learning_rate": 0.0008541036956281154, + "loss": 0.85721344, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.17651367, + "step": 1414, + "time_per_iteration": 2.917314052581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149081, + "balance_loss_mlp": 1.13212919, + "epoch": 0.2722200846479415, + "flos": 653726827008.0, + "grad_norm": 0.09608953935856007, + "language_loss": 0.81591362, + "learning_rate": 0.0008538836766458665, + "loss": 0.82740438, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.16967773, + "step": 1415, + "time_per_iteration": 2.8857710361480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115948, + "balance_loss_mlp": 1.14234948, + "epoch": 0.2724124663332051, + "flos": 579631873536.0, + "grad_norm": 0.09141970967130493, + "language_loss": 0.84791577, + "learning_rate": 0.0008536635202737897, + "loss": 0.85951054, + "num_input_tokens_seen": 
117230208, + "router_z_loss_mlp": 0.17150879, + "step": 1416, + "time_per_iteration": 2.8404181003570557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168227, + "balance_loss_mlp": 1.15094137, + "epoch": 0.2726048480184686, + "flos": 537435274752.0, + "grad_norm": 0.08934509912200893, + "language_loss": 0.81624401, + "learning_rate": 0.0008534432265973573, + "loss": 0.82792622, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.1730957, + "step": 1417, + "time_per_iteration": 2.636125326156616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117943, + "balance_loss_mlp": 1.16220391, + "epoch": 0.2727972297037322, + "flos": 995797172736.0, + "grad_norm": 0.09636198633360953, + "language_loss": 0.87909538, + "learning_rate": 0.000853222795702095, + "loss": 0.89088964, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.17248535, + "step": 1418, + "time_per_iteration": 3.452954053878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168866, + "balance_loss_mlp": 1.15174711, + "epoch": 0.27298961138899575, + "flos": 606205638144.0, + "grad_norm": 0.09586408952292569, + "language_loss": 0.83810413, + "learning_rate": 0.0008530022276735813, + "loss": 0.84979284, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.17138672, + "step": 1419, + "time_per_iteration": 2.74656081199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160265, + "balance_loss_mlp": 1.14302731, + "epoch": 0.27318199307425933, + "flos": 529325964288.0, + "grad_norm": 0.07361815357739941, + "language_loss": 0.8564744, + "learning_rate": 0.0008527815225974489, + "loss": 0.86807704, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.17260742, + "step": 1420, + "time_per_iteration": 2.6620352268218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161372, + "balance_loss_mlp": 1.14375329, + "epoch": 0.2733743747595229, + "flos": 409029129216.0, + "grad_norm": 
0.10060729288286506, + "language_loss": 0.88312179, + "learning_rate": 0.0008525606805593829, + "loss": 0.89473552, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.17651367, + "step": 1421, + "time_per_iteration": 2.4528608322143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152179, + "balance_loss_mlp": 1.13429809, + "epoch": 0.27356675644478645, + "flos": 516225747456.0, + "grad_norm": 0.0906337737142573, + "language_loss": 0.82765526, + "learning_rate": 0.0008523397016451213, + "loss": 0.83917701, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.17895508, + "step": 1422, + "time_per_iteration": 2.611370086669922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146443, + "balance_loss_mlp": 1.12862146, + "epoch": 0.27375913813005004, + "flos": 1052342088192.0, + "grad_norm": 0.0675988615568281, + "language_loss": 0.86714458, + "learning_rate": 0.0008521185859404564, + "loss": 0.87860906, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.17822266, + "step": 1423, + "time_per_iteration": 3.4147353172302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127875, + "balance_loss_mlp": 1.11027932, + "epoch": 0.27395151981531357, + "flos": 624805913088.0, + "grad_norm": 0.10391013903512737, + "language_loss": 0.89233863, + "learning_rate": 0.0008518973335312326, + "loss": 0.90361738, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.17602539, + "step": 1424, + "time_per_iteration": 2.8380019664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131922, + "balance_loss_mlp": 1.11418414, + "epoch": 0.27414390150057716, + "flos": 550372506624.0, + "grad_norm": 0.08776572848910039, + "language_loss": 0.83471692, + "learning_rate": 0.0008516759445033477, + "loss": 0.8460362, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.1776123, + "step": 1425, + "time_per_iteration": 2.6492245197296143 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01148521, + "balance_loss_mlp": 1.13083041, + "epoch": 0.2743362831858407, + "flos": 539866487808.0, + "grad_norm": 0.09331893476455168, + "language_loss": 0.84960282, + "learning_rate": 0.0008514544189427526, + "loss": 0.86108804, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.17687988, + "step": 1426, + "time_per_iteration": 2.694824457168579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160501, + "balance_loss_mlp": 1.14289403, + "epoch": 0.2745286648711043, + "flos": 468590759424.0, + "grad_norm": 0.10058930784889258, + "language_loss": 0.86324757, + "learning_rate": 0.0008512327569354511, + "loss": 0.8748526, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.17602539, + "step": 1427, + "time_per_iteration": 2.5711381435394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170402, + "balance_loss_mlp": 1.15265131, + "epoch": 0.2747210465563678, + "flos": 472867683840.0, + "grad_norm": 0.08313733600620697, + "language_loss": 0.83505958, + "learning_rate": 0.0008510109585675001, + "loss": 0.84676361, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.17749023, + "step": 1428, + "time_per_iteration": 2.6291069984436035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075627, + "balance_loss_mlp": 1.06465936, + "epoch": 0.2749134282416314, + "flos": 1315085372928.0, + "grad_norm": 0.04529042076604016, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82228971, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.10986328, + "step": 1429, + "time_per_iteration": 4.732970952987671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151608, + "balance_loss_mlp": 1.13460922, + "epoch": 0.275105809926895, + "flos": 970861718016.0, + "grad_norm": 0.10649873504882197, + "language_loss": 0.80186272, + "learning_rate": 0.0008505669530941415, + "loss": 0.81337881, + 
"num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.17016602, + "step": 1430, + "time_per_iteration": 3.3425114154815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132102, + "balance_loss_mlp": 1.11454248, + "epoch": 0.2752981916121585, + "flos": 527344432128.0, + "grad_norm": 0.09668389067503143, + "language_loss": 0.83789647, + "learning_rate": 0.000850344746161112, + "loss": 0.84921753, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.17578125, + "step": 1431, + "time_per_iteration": 2.6212620735168457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115216, + "balance_loss_mlp": 1.09790659, + "epoch": 0.2754905732974221, + "flos": 453709071360.0, + "grad_norm": 0.07650346740070771, + "language_loss": 0.87718683, + "learning_rate": 0.0008501224032121894, + "loss": 0.88833898, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.17321777, + "step": 1432, + "time_per_iteration": 2.531632900238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099408, + "balance_loss_mlp": 1.0818007, + "epoch": 0.27568295498268564, + "flos": 497474597376.0, + "grad_norm": 0.07599019403635421, + "language_loss": 0.81644619, + "learning_rate": 0.0008498999243336946, + "loss": 0.82744026, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.17626953, + "step": 1433, + "time_per_iteration": 2.6577858924865723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108116, + "balance_loss_mlp": 1.09086609, + "epoch": 0.2758753366679492, + "flos": 608194510848.0, + "grad_norm": 0.08691171830183525, + "language_loss": 0.87290454, + "learning_rate": 0.0008496773096120021, + "loss": 0.8839857, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.17260742, + "step": 1434, + "time_per_iteration": 2.8218367099761963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103536, + "balance_loss_mlp": 1.08573806, + "epoch": 0.27606771835321275, + "flos": 
740129094144.0, + "grad_norm": 0.09853984157164923, + "language_loss": 0.83996856, + "learning_rate": 0.0008494545591335381, + "loss": 0.85100389, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.17810059, + "step": 1435, + "time_per_iteration": 2.9297800064086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114552, + "balance_loss_mlp": 1.09671807, + "epoch": 0.27626010003847634, + "flos": 554572707840.0, + "grad_norm": 0.06137328591569865, + "language_loss": 0.86751276, + "learning_rate": 0.0008492316729847823, + "loss": 0.87865829, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.1784668, + "step": 1436, + "time_per_iteration": 2.8056235313415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111542, + "balance_loss_mlp": 1.09787273, + "epoch": 0.2764524817237399, + "flos": 542554661376.0, + "grad_norm": 0.08045565015071575, + "language_loss": 0.79808342, + "learning_rate": 0.0008490086512522664, + "loss": 0.8092376, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.17565918, + "step": 1437, + "time_per_iteration": 2.7486345767974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125236, + "balance_loss_mlp": 1.10653245, + "epoch": 0.27664486340900346, + "flos": 406246980096.0, + "grad_norm": 0.07152243392964944, + "language_loss": 0.90246308, + "learning_rate": 0.0008487854940225755, + "loss": 0.91371536, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.18701172, + "step": 1438, + "time_per_iteration": 2.45500111579895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119071, + "balance_loss_mlp": 1.10104609, + "epoch": 0.27683724509426705, + "flos": 522138410496.0, + "grad_norm": 0.12336147646099646, + "language_loss": 0.89520633, + "learning_rate": 0.0008485622013823466, + "loss": 0.9063971, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.18029785, + "step": 1439, + "time_per_iteration": 2.6394927501678467 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116899, + "balance_loss_mlp": 1.09899366, + "epoch": 0.2770296267795306, + "flos": 535349855232.0, + "grad_norm": 0.08970889576331396, + "language_loss": 0.83229852, + "learning_rate": 0.00084833877341827, + "loss": 0.84346747, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.17895508, + "step": 1440, + "time_per_iteration": 2.673386812210083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137485, + "balance_loss_mlp": 1.11953235, + "epoch": 0.27722200846479417, + "flos": 487991651328.0, + "grad_norm": 0.09818503582677594, + "language_loss": 0.8055383, + "learning_rate": 0.000848115210217088, + "loss": 0.81691313, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.17956543, + "step": 1441, + "time_per_iteration": 2.6129040718078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143053, + "balance_loss_mlp": 1.12554169, + "epoch": 0.2774143901500577, + "flos": 618297836544.0, + "grad_norm": 0.08082573862086316, + "language_loss": 0.81372535, + "learning_rate": 0.0008478915118655952, + "loss": 0.82515597, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.17529297, + "step": 1442, + "time_per_iteration": 2.843041181564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150917, + "balance_loss_mlp": 1.13371468, + "epoch": 0.2776067718353213, + "flos": 513819127296.0, + "grad_norm": 0.07560665817061937, + "language_loss": 0.86043841, + "learning_rate": 0.0008476676784506393, + "loss": 0.87194753, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.17224121, + "step": 1443, + "time_per_iteration": 2.669281005859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145766, + "balance_loss_mlp": 1.12862349, + "epoch": 0.2777991535205848, + "flos": 1004395811328.0, + "grad_norm": 0.07357545068984293, + "language_loss": 0.81809199, + "learning_rate": 0.0008474437100591201, + "loss": 
0.82954967, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.17150879, + "step": 1444, + "time_per_iteration": 3.32959246635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112957, + "balance_loss_mlp": 1.1127255, + "epoch": 0.2779915352058484, + "flos": 550278531072.0, + "grad_norm": 0.08256066258120752, + "language_loss": 0.85183853, + "learning_rate": 0.0008472196067779898, + "loss": 0.86313421, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.1685791, + "step": 1445, + "time_per_iteration": 2.6932947635650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011128, + "balance_loss_mlp": 1.09586096, + "epoch": 0.278183916891112, + "flos": 873798160896.0, + "grad_norm": 0.1350534130118882, + "language_loss": 0.85003686, + "learning_rate": 0.0008469953686942531, + "loss": 0.86116487, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.16955566, + "step": 1446, + "time_per_iteration": 3.0903265476226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122325, + "balance_loss_mlp": 1.10539699, + "epoch": 0.2783762985763755, + "flos": 624064766976.0, + "grad_norm": 0.09027465145753444, + "language_loss": 0.82766867, + "learning_rate": 0.0008467709958949668, + "loss": 0.83889192, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.16943359, + "step": 1447, + "time_per_iteration": 2.7486042976379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122592, + "balance_loss_mlp": 1.1059382, + "epoch": 0.2785686802616391, + "flos": 581838432768.0, + "grad_norm": 0.08057262764159107, + "language_loss": 0.85942835, + "learning_rate": 0.0008465464884672403, + "loss": 0.87065423, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.16662598, + "step": 1448, + "time_per_iteration": 2.7239129543304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128319, + "balance_loss_mlp": 1.11145079, + "epoch": 0.27876106194690264, + 
"flos": 587333348352.0, + "grad_norm": 0.0722544104008292, + "language_loss": 0.85391676, + "learning_rate": 0.0008463218464982348, + "loss": 0.86520004, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.16882324, + "step": 1449, + "time_per_iteration": 2.824716329574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112804, + "balance_loss_mlp": 1.11102891, + "epoch": 0.27895344363216623, + "flos": 875982325248.0, + "grad_norm": 0.07814645269371487, + "language_loss": 0.8771199, + "learning_rate": 0.0008460970700751645, + "loss": 0.88840032, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.17016602, + "step": 1450, + "time_per_iteration": 3.1141586303710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126615, + "balance_loss_mlp": 1.10931802, + "epoch": 0.27914582531742976, + "flos": 603910245888.0, + "grad_norm": 0.07255444133213705, + "language_loss": 0.87776339, + "learning_rate": 0.000845872159285295, + "loss": 0.8890295, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.1730957, + "step": 1451, + "time_per_iteration": 2.739476442337036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085209, + "balance_loss_mlp": 1.07529104, + "epoch": 0.27933820700269335, + "flos": 1497738097152.0, + "grad_norm": 0.033234239085754465, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78852057, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.09912109, + "step": 1452, + "time_per_iteration": 4.910952806472778 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138861, + "balance_loss_mlp": 1.12121844, + "epoch": 0.2795305886879569, + "flos": 1031859025920.0, + "grad_norm": 0.10385775803237589, + "language_loss": 0.86136031, + "learning_rate": 0.0008454219349544836, + "loss": 0.87274891, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.17651367, + "step": 1453, + "time_per_iteration": 3.3671629428863525 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121876, + "balance_loss_mlp": 1.10430491, + "epoch": 0.27972297037322047, + "flos": 607058012160.0, + "grad_norm": 0.07125574209855656, + "language_loss": 0.82064086, + "learning_rate": 0.000845196621588334, + "loss": 0.83185959, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.17602539, + "step": 1454, + "time_per_iteration": 2.775218963623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125012, + "balance_loss_mlp": 1.107584, + "epoch": 0.27991535205848406, + "flos": 630380123136.0, + "grad_norm": 0.07195305251583452, + "language_loss": 0.7580061, + "learning_rate": 0.0008449711742049706, + "loss": 0.76925623, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.17443848, + "step": 1455, + "time_per_iteration": 2.785322427749634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120523, + "balance_loss_mlp": 1.10295129, + "epoch": 0.2801077337437476, + "flos": 549297676800.0, + "grad_norm": 0.08382647519260926, + "language_loss": 0.83480191, + "learning_rate": 0.0008447455928919196, + "loss": 0.84600711, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.17590332, + "step": 1456, + "time_per_iteration": 2.660736083984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119281, + "balance_loss_mlp": 1.10179305, + "epoch": 0.2803001154290112, + "flos": 486761177088.0, + "grad_norm": 0.0678890613230097, + "language_loss": 0.86596936, + "learning_rate": 0.0008445198777367595, + "loss": 0.87716216, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.17492676, + "step": 1457, + "time_per_iteration": 2.5753204822540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121467, + "balance_loss_mlp": 1.10389531, + "epoch": 0.2804924971142747, + "flos": 522074170368.0, + "grad_norm": 0.10986912551565038, + "language_loss": 0.80972993, + "learning_rate": 0.0008442940288271208, + "loss": 
0.82094461, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.17578125, + "step": 1458, + "time_per_iteration": 2.641165018081665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112066, + "balance_loss_mlp": 1.10273051, + "epoch": 0.2806848787995383, + "flos": 527697566208.0, + "grad_norm": 0.06853525506838967, + "language_loss": 0.86948931, + "learning_rate": 0.0008440680462506856, + "loss": 0.88069594, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.17932129, + "step": 1459, + "time_per_iteration": 2.7613425254821777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115398, + "balance_loss_mlp": 1.09818411, + "epoch": 0.2808772604848018, + "flos": 485493626880.0, + "grad_norm": 0.07519938139917645, + "language_loss": 0.86463004, + "learning_rate": 0.0008438419300951883, + "loss": 0.87578404, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.17224121, + "step": 1460, + "time_per_iteration": 2.684657335281372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116837, + "balance_loss_mlp": 1.09928942, + "epoch": 0.2810696421700654, + "flos": 618139620864.0, + "grad_norm": 0.0687143759737579, + "language_loss": 0.86178434, + "learning_rate": 0.0008436156804484148, + "loss": 0.8729527, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.17565918, + "step": 1461, + "time_per_iteration": 2.860818386077881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111314, + "balance_loss_mlp": 1.09343266, + "epoch": 0.28126202385532895, + "flos": 454754165760.0, + "grad_norm": 0.11710518654826144, + "language_loss": 0.88180649, + "learning_rate": 0.0008433892973982031, + "loss": 0.89291972, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.17883301, + "step": 1462, + "time_per_iteration": 2.58311128616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106143, + "balance_loss_mlp": 1.08844042, + "epoch": 0.28145440554059253, + 
"flos": 530704742400.0, + "grad_norm": 0.07819154550189573, + "language_loss": 0.84951186, + "learning_rate": 0.0008431627810324431, + "loss": 0.86057329, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.17724609, + "step": 1463, + "time_per_iteration": 2.6800074577331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111406, + "balance_loss_mlp": 1.09443069, + "epoch": 0.2816467872258561, + "flos": 452228977152.0, + "grad_norm": 0.06467590099086191, + "language_loss": 0.81057346, + "learning_rate": 0.000842936131439076, + "loss": 0.82168752, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.16992188, + "step": 1464, + "time_per_iteration": 2.6747214794158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111707, + "balance_loss_mlp": 1.09463668, + "epoch": 0.28183916891111965, + "flos": 472712039424.0, + "grad_norm": 0.06943840277913271, + "language_loss": 0.87714398, + "learning_rate": 0.0008427093487060951, + "loss": 0.88826108, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.17089844, + "step": 1465, + "time_per_iteration": 2.6723203659057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113512, + "balance_loss_mlp": 1.09656, + "epoch": 0.28203155059638324, + "flos": 557053479936.0, + "grad_norm": 0.06709163317621891, + "language_loss": 0.846192, + "learning_rate": 0.000842482432921545, + "loss": 0.8573271, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.16955566, + "step": 1466, + "time_per_iteration": 2.8659911155700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104971, + "balance_loss_mlp": 1.0876503, + "epoch": 0.28222393228164677, + "flos": 416980224000.0, + "grad_norm": 0.07868097185173097, + "language_loss": 0.86230814, + "learning_rate": 0.0008422553841735225, + "loss": 0.87335783, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.17333984, + "step": 1467, + "time_per_iteration": 2.5069150924682617 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109046, + "balance_loss_mlp": 1.09167767, + "epoch": 0.28241631396691036, + "flos": 604910923776.0, + "grad_norm": 0.07514750891429747, + "language_loss": 0.84737515, + "learning_rate": 0.0008420282025501757, + "loss": 0.85846567, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.17370605, + "step": 1468, + "time_per_iteration": 2.808751344680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094588, + "balance_loss_mlp": 1.07768393, + "epoch": 0.2826086956521739, + "flos": 572968152576.0, + "grad_norm": 0.0683968152950732, + "language_loss": 0.84884882, + "learning_rate": 0.0008418008881397043, + "loss": 0.85979474, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.16918945, + "step": 1469, + "time_per_iteration": 2.6929962635040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089051, + "balance_loss_mlp": 1.07267165, + "epoch": 0.2828010773374375, + "flos": 842756949504.0, + "grad_norm": 0.0720569823253329, + "language_loss": 0.82694614, + "learning_rate": 0.0008415734410303595, + "loss": 0.83783662, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.16381836, + "step": 1470, + "time_per_iteration": 3.2501566410064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095144, + "balance_loss_mlp": 1.07776332, + "epoch": 0.28299345902270107, + "flos": 542675801088.0, + "grad_norm": 0.07334017240809462, + "language_loss": 0.90763617, + "learning_rate": 0.0008413458613104444, + "loss": 0.91858757, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.17407227, + "step": 1471, + "time_per_iteration": 2.7336316108703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089358, + "balance_loss_mlp": 1.07198906, + "epoch": 0.2831858407079646, + "flos": 571606626816.0, + "grad_norm": 0.06835636483746928, + "language_loss": 0.82895148, + "learning_rate": 0.0008411181490683129, + "loss": 
0.839845, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.17370605, + "step": 1472, + "time_per_iteration": 2.742314100265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085557, + "balance_loss_mlp": 1.0680809, + "epoch": 0.2833782223932282, + "flos": 763826734080.0, + "grad_norm": 0.08020623974692119, + "language_loss": 0.82316583, + "learning_rate": 0.0008408903043923707, + "loss": 0.83402139, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.17492676, + "step": 1473, + "time_per_iteration": 3.0307655334472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090161, + "balance_loss_mlp": 1.07230377, + "epoch": 0.2835706040784917, + "flos": 539051189760.0, + "grad_norm": 0.09874308222598177, + "language_loss": 0.81175971, + "learning_rate": 0.0008406623273710754, + "loss": 0.8226614, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.17858887, + "step": 1474, + "time_per_iteration": 2.6652164459228516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086824, + "balance_loss_mlp": 1.06919324, + "epoch": 0.2837629857637553, + "flos": 530593514496.0, + "grad_norm": 0.0806852987114514, + "language_loss": 0.82865691, + "learning_rate": 0.0008404342180929351, + "loss": 0.83952522, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.1763916, + "step": 1475, + "time_per_iteration": 2.676020622253418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085303, + "balance_loss_mlp": 1.06739831, + "epoch": 0.28395536744901884, + "flos": 540032044032.0, + "grad_norm": 0.0807623151432505, + "language_loss": 0.81497931, + "learning_rate": 0.00084020597664651, + "loss": 0.82583237, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.17907715, + "step": 1476, + "time_per_iteration": 2.8055877685546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087711, + "balance_loss_mlp": 1.06957936, + "epoch": 0.2841477491342824, + "flos": 
573635146752.0, + "grad_norm": 0.09698913749719028, + "language_loss": 0.83786356, + "learning_rate": 0.0008399776031204111, + "loss": 0.8487407, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.18139648, + "step": 1477, + "time_per_iteration": 2.7545149326324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087702, + "balance_loss_mlp": 1.06898642, + "epoch": 0.28434013081954596, + "flos": 572068790784.0, + "grad_norm": 0.09010893322506078, + "language_loss": 0.7971096, + "learning_rate": 0.0008397490976033009, + "loss": 0.80798662, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.18713379, + "step": 1478, + "time_per_iteration": 2.654254198074341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107225, + "balance_loss_mlp": 1.06009066, + "epoch": 0.28453251250480954, + "flos": 1553376310272.0, + "grad_norm": 0.04001675887347635, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78951895, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.12158203, + "step": 1479, + "time_per_iteration": 4.77993631362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088996, + "balance_loss_mlp": 1.07022035, + "epoch": 0.28472489419007313, + "flos": 749061043200.0, + "grad_norm": 0.07008895147668387, + "language_loss": 0.84977293, + "learning_rate": 0.0008392916909509525, + "loss": 0.86066294, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.18762207, + "step": 1480, + "time_per_iteration": 3.103787422180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110182, + "balance_loss_mlp": 1.08308077, + "epoch": 0.28491727587533666, + "flos": 490158563328.0, + "grad_norm": 0.07686502510285433, + "language_loss": 0.8518846, + "learning_rate": 0.0008390627899932954, + "loss": 0.86290276, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.18737793, + "step": 1481, + "time_per_iteration": 2.6177799701690674 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113908, + "balance_loss_mlp": 1.09524012, + "epoch": 0.28510965756060025, + "flos": 729007838208.0, + "grad_norm": 0.10214098417508043, + "language_loss": 0.88852942, + "learning_rate": 0.000838833757399789, + "loss": 0.89966846, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.18664551, + "step": 1482, + "time_per_iteration": 2.9566540718078613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114933, + "balance_loss_mlp": 1.09678972, + "epoch": 0.2853020392458638, + "flos": 551573245440.0, + "grad_norm": 0.08257095939450843, + "language_loss": 0.80571115, + "learning_rate": 0.0008386045932593515, + "loss": 0.81686044, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.18139648, + "step": 1483, + "time_per_iteration": 2.717756509780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109957, + "balance_loss_mlp": 1.09277904, + "epoch": 0.28549442093112737, + "flos": 754783557120.0, + "grad_norm": 0.07262082200825942, + "language_loss": 0.86045611, + "learning_rate": 0.0008383752976609525, + "loss": 0.87155575, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.171875, + "step": 1484, + "time_per_iteration": 2.950330972671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113011, + "balance_loss_mlp": 1.09571338, + "epoch": 0.2856868026163909, + "flos": 538589025792.0, + "grad_norm": 0.06349274760065945, + "language_loss": 0.7998122, + "learning_rate": 0.0008381458706936123, + "loss": 0.81094229, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.17321777, + "step": 1485, + "time_per_iteration": 2.750422239303589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105744, + "balance_loss_mlp": 1.08867359, + "epoch": 0.2858791843016545, + "flos": 583772977152.0, + "grad_norm": 0.08725606785490185, + "language_loss": 0.87347835, + "learning_rate": 0.0008379163124464025, + "loss": 0.88453579, 
+ "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.17089844, + "step": 1486, + "time_per_iteration": 2.8127403259277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108247, + "balance_loss_mlp": 1.09145021, + "epoch": 0.286071565986918, + "flos": 644812130304.0, + "grad_norm": 0.08194161324991753, + "language_loss": 0.7704097, + "learning_rate": 0.0008376866230084452, + "loss": 0.78149223, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.16809082, + "step": 1487, + "time_per_iteration": 2.8382246494293213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102304, + "balance_loss_mlp": 1.08535266, + "epoch": 0.2862639476721816, + "flos": 491361873408.0, + "grad_norm": 0.07305349361660647, + "language_loss": 0.85623455, + "learning_rate": 0.000837456802468914, + "loss": 0.8672576, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.16967773, + "step": 1488, + "time_per_iteration": 2.619359016418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101386, + "balance_loss_mlp": 1.08414829, + "epoch": 0.2864563293574452, + "flos": 521639170560.0, + "grad_norm": 0.08101706440693511, + "language_loss": 0.85233498, + "learning_rate": 0.0008372268509170331, + "loss": 0.86334878, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.17248535, + "step": 1489, + "time_per_iteration": 2.735579252243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104312, + "balance_loss_mlp": 1.08728886, + "epoch": 0.2866487110427087, + "flos": 547118281728.0, + "grad_norm": 0.09066736504037358, + "language_loss": 0.84989464, + "learning_rate": 0.0008369967684420779, + "loss": 0.86093777, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.17041016, + "step": 1490, + "time_per_iteration": 2.7550840377807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099327, + "balance_loss_mlp": 1.08251846, + "epoch": 0.2868410927279723, + "flos": 
482224720896.0, + "grad_norm": 0.11208283725325253, + "language_loss": 0.84236765, + "learning_rate": 0.0008367665551333736, + "loss": 0.85336089, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.16821289, + "step": 1491, + "time_per_iteration": 2.6229591369628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118339, + "balance_loss_mlp": 1.10114861, + "epoch": 0.28703347441323585, + "flos": 724889129472.0, + "grad_norm": 0.08256436767566132, + "language_loss": 0.85062146, + "learning_rate": 0.0008365362110802977, + "loss": 0.86180484, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.17211914, + "step": 1492, + "time_per_iteration": 2.871260166168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139482, + "balance_loss_mlp": 1.12254202, + "epoch": 0.28722585609849943, + "flos": 634978248192.0, + "grad_norm": 0.14712707580735673, + "language_loss": 0.82232606, + "learning_rate": 0.0008363057363722773, + "loss": 0.83372086, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.16955566, + "step": 1493, + "time_per_iteration": 2.8748109340667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156529, + "balance_loss_mlp": 1.14010167, + "epoch": 0.28741823778376296, + "flos": 510229020672.0, + "grad_norm": 0.10196458183452421, + "language_loss": 0.84016562, + "learning_rate": 0.0008360751310987906, + "loss": 0.85173088, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.16430664, + "step": 1494, + "time_per_iteration": 2.6634154319763184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156202, + "balance_loss_mlp": 1.13989449, + "epoch": 0.28761061946902655, + "flos": 603752030208.0, + "grad_norm": 0.07806891614800103, + "language_loss": 0.85166085, + "learning_rate": 0.0008358443953493666, + "loss": 0.8632229, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.16308594, + "step": 1495, + "time_per_iteration": 2.875852584838867 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161766, + "balance_loss_mlp": 1.1449573, + "epoch": 0.28780300115429014, + "flos": 407193329664.0, + "grad_norm": 0.11619662908019952, + "language_loss": 0.88208884, + "learning_rate": 0.0008356135292135851, + "loss": 0.89370644, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.16821289, + "step": 1496, + "time_per_iteration": 2.5129776000976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129263, + "balance_loss_mlp": 1.11256182, + "epoch": 0.28799538283955367, + "flos": 374929357824.0, + "grad_norm": 0.0960393188024377, + "language_loss": 0.91794455, + "learning_rate": 0.0008353825327810758, + "loss": 0.92923725, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.16711426, + "step": 1497, + "time_per_iteration": 2.437619686126709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109969, + "balance_loss_mlp": 1.09312487, + "epoch": 0.28818776452481726, + "flos": 591919363584.0, + "grad_norm": 0.09345990074491838, + "language_loss": 0.81679749, + "learning_rate": 0.00083515140614152, + "loss": 0.82789719, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.1685791, + "step": 1498, + "time_per_iteration": 2.7478325366973877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119738, + "balance_loss_mlp": 1.10310864, + "epoch": 0.2883801462100808, + "flos": 535075642368.0, + "grad_norm": 0.10003726096036522, + "language_loss": 0.868577, + "learning_rate": 0.0008349201493846485, + "loss": 0.87977445, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.16625977, + "step": 1499, + "time_per_iteration": 2.639324188232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116843, + "balance_loss_mlp": 1.09971237, + "epoch": 0.2885725278953444, + "flos": 480094884864.0, + "grad_norm": 0.07951211502216154, + "language_loss": 0.89032578, + "learning_rate": 0.0008346887626002432, + "loss": 
0.90149426, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.17150879, + "step": 1500, + "time_per_iteration": 2.542311668395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120306, + "balance_loss_mlp": 1.10360527, + "epoch": 0.2887649095806079, + "flos": 464044391424.0, + "grad_norm": 0.0665017309713035, + "language_loss": 0.85912937, + "learning_rate": 0.000834457245878137, + "loss": 0.87033248, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.16711426, + "step": 1501, + "time_per_iteration": 2.639570951461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122516, + "balance_loss_mlp": 1.10619664, + "epoch": 0.2889572912658715, + "flos": 931032092160.0, + "grad_norm": 0.07589763823888349, + "language_loss": 0.80857193, + "learning_rate": 0.000834225599308212, + "loss": 0.81979704, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.16320801, + "step": 1502, + "time_per_iteration": 3.2867560386657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113369, + "balance_loss_mlp": 1.11684537, + "epoch": 0.28914967295113503, + "flos": 570129103872.0, + "grad_norm": 0.09000351929941647, + "language_loss": 0.84986663, + "learning_rate": 0.0008339938229804016, + "loss": 0.86120355, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.1685791, + "step": 1503, + "time_per_iteration": 2.7262394428253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167456, + "balance_loss_mlp": 1.15496254, + "epoch": 0.2893420546363986, + "flos": 1486614643200.0, + "grad_norm": 0.04837114619258858, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.7660228, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.12451172, + "step": 1504, + "time_per_iteration": 4.9622483253479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129895, + "balance_loss_mlp": 1.11289549, + "epoch": 0.2895344363216622, + 
"flos": 470186850816.0, + "grad_norm": 0.1124140207378676, + "language_loss": 0.83872616, + "learning_rate": 0.0008335298814111094, + "loss": 0.85002512, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.17016602, + "step": 1505, + "time_per_iteration": 2.6357829570770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133506, + "balance_loss_mlp": 1.11616087, + "epoch": 0.28972681800692573, + "flos": 648194835456.0, + "grad_norm": 0.09211411957598506, + "language_loss": 0.87906271, + "learning_rate": 0.0008332977163497455, + "loss": 0.89039779, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.17370605, + "step": 1506, + "time_per_iteration": 2.798208475112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123584, + "balance_loss_mlp": 1.10653734, + "epoch": 0.2899191996921893, + "flos": 572224435200.0, + "grad_norm": 0.07286788522172229, + "language_loss": 0.83603442, + "learning_rate": 0.0008330654218907325, + "loss": 0.84727025, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.1706543, + "step": 1507, + "time_per_iteration": 2.708980083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112441, + "balance_loss_mlp": 1.09509647, + "epoch": 0.29011158137745285, + "flos": 661356721152.0, + "grad_norm": 0.06462764814837715, + "language_loss": 0.8140111, + "learning_rate": 0.0008328329981242548, + "loss": 0.82513553, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.17358398, + "step": 1508, + "time_per_iteration": 2.894169330596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110509, + "balance_loss_mlp": 1.08767331, + "epoch": 0.29030396306271644, + "flos": 536226822144.0, + "grad_norm": 0.08188322832397743, + "language_loss": 0.87448251, + "learning_rate": 0.0008326004451405475, + "loss": 0.88553333, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.17443848, + "step": 1509, + "time_per_iteration": 2.8026657104492188 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092866, + "balance_loss_mlp": 1.07596231, + "epoch": 0.29049634474798, + "flos": 511956163584.0, + "grad_norm": 0.07862145855051805, + "language_loss": 0.81981707, + "learning_rate": 0.0008323677630298957, + "loss": 0.8307457, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.16918945, + "step": 1510, + "time_per_iteration": 2.6314613819122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109107, + "balance_loss_mlp": 1.07407045, + "epoch": 0.29068872643324356, + "flos": 613758809088.0, + "grad_norm": 0.06795291351042136, + "language_loss": 0.84809089, + "learning_rate": 0.0008321349518826345, + "loss": 0.85900158, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.17016602, + "step": 1511, + "time_per_iteration": 2.8404459953308105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086604, + "balance_loss_mlp": 1.06950927, + "epoch": 0.2908811081185071, + "flos": 546424123392.0, + "grad_norm": 0.11455853074779208, + "language_loss": 0.95139891, + "learning_rate": 0.0008319020117891491, + "loss": 0.96226501, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.17102051, + "step": 1512, + "time_per_iteration": 2.6767001152038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084061, + "balance_loss_mlp": 1.06650186, + "epoch": 0.2910734898037707, + "flos": 604792355328.0, + "grad_norm": 0.0847466939070868, + "language_loss": 0.86754417, + "learning_rate": 0.0008316689428398751, + "loss": 0.87838477, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.17565918, + "step": 1513, + "time_per_iteration": 2.7069385051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079727, + "balance_loss_mlp": 1.06318033, + "epoch": 0.29126587148903427, + "flos": 574672900608.0, + "grad_norm": 0.1225209310639027, + "language_loss": 0.88519126, + "learning_rate": 0.0008314357451252979, + "loss": 
0.89598852, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.16552734, + "step": 1514, + "time_per_iteration": 2.8014771938323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088545, + "balance_loss_mlp": 1.07215357, + "epoch": 0.2914582531742978, + "flos": 571068112896.0, + "grad_norm": 0.09390151153588368, + "language_loss": 0.87912899, + "learning_rate": 0.0008312024187359527, + "loss": 0.89001441, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.16394043, + "step": 1515, + "time_per_iteration": 2.646131992340088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089961, + "balance_loss_mlp": 1.07367659, + "epoch": 0.2916506348595614, + "flos": 730878142464.0, + "grad_norm": 0.0632997915526053, + "language_loss": 0.87038326, + "learning_rate": 0.000830968963762425, + "loss": 0.88128293, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.1628418, + "step": 1516, + "time_per_iteration": 3.0603909492492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104224, + "balance_loss_mlp": 1.08745098, + "epoch": 0.2918430165448249, + "flos": 510468728832.0, + "grad_norm": 0.08225160647217689, + "language_loss": 0.83996677, + "learning_rate": 0.0008307353802953497, + "loss": 0.85100901, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.16784668, + "step": 1517, + "time_per_iteration": 2.7085869312286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105905, + "balance_loss_mlp": 1.08885777, + "epoch": 0.2920353982300885, + "flos": 630397375488.0, + "grad_norm": 0.07719324020211826, + "language_loss": 0.85852122, + "learning_rate": 0.0008305016684254125, + "loss": 0.86958027, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.17053223, + "step": 1518, + "time_per_iteration": 2.843050241470337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114876, + "balance_loss_mlp": 1.0979718, + "epoch": 0.29222777991535204, + 
"flos": 501662688768.0, + "grad_norm": 0.07921278172023684, + "language_loss": 0.86861145, + "learning_rate": 0.0008302678282433479, + "loss": 0.87976027, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.16918945, + "step": 1519, + "time_per_iteration": 2.605964422225952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122329, + "balance_loss_mlp": 1.10534143, + "epoch": 0.2924201616006156, + "flos": 486785769984.0, + "grad_norm": 0.07975311040882123, + "language_loss": 0.84663725, + "learning_rate": 0.0008300338598399411, + "loss": 0.85786051, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.17004395, + "step": 1520, + "time_per_iteration": 2.6344962120056152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128968, + "balance_loss_mlp": 1.11150408, + "epoch": 0.2926125432858792, + "flos": 476450449920.0, + "grad_norm": 0.07139673380832469, + "language_loss": 0.9444648, + "learning_rate": 0.0008297997633060263, + "loss": 0.95575452, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.17480469, + "step": 1521, + "time_per_iteration": 2.5109918117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123567, + "balance_loss_mlp": 1.10605538, + "epoch": 0.29280492497114274, + "flos": 676675980288.0, + "grad_norm": 0.07755113838475138, + "language_loss": 0.84917367, + "learning_rate": 0.0008295655387324883, + "loss": 0.86040938, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.17529297, + "step": 1522, + "time_per_iteration": 2.8314778804779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132674, + "balance_loss_mlp": 1.11578202, + "epoch": 0.29299730665640633, + "flos": 458408512512.0, + "grad_norm": 0.08909358029202981, + "language_loss": 0.84779286, + "learning_rate": 0.0008293311862102609, + "loss": 0.85911965, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.16894531, + "step": 1523, + "time_per_iteration": 
2.5455641746520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112826, + "balance_loss_mlp": 1.11147499, + "epoch": 0.29318968834166986, + "flos": 446573274624.0, + "grad_norm": 0.07268877656623862, + "language_loss": 0.88628173, + "learning_rate": 0.0008290967058303275, + "loss": 0.89756435, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.16796875, + "step": 1524, + "time_per_iteration": 2.5151915550231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114491, + "balance_loss_mlp": 1.1288048, + "epoch": 0.29338207002693345, + "flos": 450319025664.0, + "grad_norm": 0.07556317822889831, + "language_loss": 0.86503643, + "learning_rate": 0.0008288620976837219, + "loss": 0.87648547, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.16101074, + "step": 1525, + "time_per_iteration": 2.526381731033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145799, + "balance_loss_mlp": 1.12897861, + "epoch": 0.293574451712197, + "flos": 502277925888.0, + "grad_norm": 0.07322803654736391, + "language_loss": 0.826621, + "learning_rate": 0.000828627361861527, + "loss": 0.83807898, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.16833496, + "step": 1526, + "time_per_iteration": 2.629249334335327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146511, + "balance_loss_mlp": 1.13019073, + "epoch": 0.29376683339746057, + "flos": 696462312960.0, + "grad_norm": 0.08423530938833095, + "language_loss": 0.84572363, + "learning_rate": 0.0008283924984548752, + "loss": 0.8571887, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.16320801, + "step": 1527, + "time_per_iteration": 2.966165542602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140198, + "balance_loss_mlp": 1.12374687, + "epoch": 0.2939592150827241, + "flos": 478590197760.0, + "grad_norm": 0.0645510946599831, + "language_loss": 0.8449617, + "learning_rate": 0.0008281575075549485, 
+ "loss": 0.85636371, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.16455078, + "step": 1528, + "time_per_iteration": 2.58369779586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161514, + "balance_loss_mlp": 1.14954567, + "epoch": 0.2941515967679877, + "flos": 1485260831232.0, + "grad_norm": 0.05917981842870205, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.78514206, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.11962891, + "step": 1529, + "time_per_iteration": 4.658821105957031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131087, + "balance_loss_mlp": 1.1146121, + "epoch": 0.2943439784532513, + "flos": 674158132224.0, + "grad_norm": 0.08930626055051794, + "language_loss": 0.90355158, + "learning_rate": 0.0008276871436402469, + "loss": 0.91486251, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.16479492, + "step": 1530, + "time_per_iteration": 2.8411099910736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136163, + "balance_loss_mlp": 1.12017739, + "epoch": 0.2945363601385148, + "flos": 576301298688.0, + "grad_norm": 0.15569448105103711, + "language_loss": 0.87387383, + "learning_rate": 0.000827451770808083, + "loss": 0.88523543, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.15979004, + "step": 1531, + "time_per_iteration": 2.716938018798828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126528, + "balance_loss_mlp": 1.11020815, + "epoch": 0.2947287418237784, + "flos": 480655793664.0, + "grad_norm": 0.07571292712277376, + "language_loss": 0.83393914, + "learning_rate": 0.0008272162708478674, + "loss": 0.84520441, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.16320801, + "step": 1532, + "time_per_iteration": 2.589401960372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125487, + "balance_loss_mlp": 1.10926247, + "epoch": 
0.2949211235090419, + "flos": 558185209344.0, + "grad_norm": 0.0702796828307527, + "language_loss": 0.85952383, + "learning_rate": 0.000826980643851029, + "loss": 0.87077868, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.16223145, + "step": 1533, + "time_per_iteration": 2.730564594268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111085, + "balance_loss_mlp": 1.09442306, + "epoch": 0.2951135051943055, + "flos": 483887623680.0, + "grad_norm": 0.090864784531222, + "language_loss": 0.84450942, + "learning_rate": 0.0008267448899090464, + "loss": 0.85561788, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.16430664, + "step": 1534, + "time_per_iteration": 2.5810909271240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116842, + "balance_loss_mlp": 1.10008121, + "epoch": 0.29530588687956905, + "flos": 550295783424.0, + "grad_norm": 0.07312583256714535, + "language_loss": 0.80780327, + "learning_rate": 0.0008265090091134473, + "loss": 0.81897163, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.16760254, + "step": 1535, + "time_per_iteration": 2.852243423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101211, + "balance_loss_mlp": 1.08464038, + "epoch": 0.29549826856483263, + "flos": 673046226432.0, + "grad_norm": 0.06558641515181687, + "language_loss": 0.80252028, + "learning_rate": 0.0008262730015558088, + "loss": 0.81353235, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.16577148, + "step": 1536, + "time_per_iteration": 2.888068675994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094086, + "balance_loss_mlp": 1.07725406, + "epoch": 0.29569065025009617, + "flos": 764666625024.0, + "grad_norm": 0.0890497395672015, + "language_loss": 0.81906033, + "learning_rate": 0.0008260368673277574, + "loss": 0.83000118, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.16845703, + "step": 1537, + "time_per_iteration": 
3.1171438694000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089572, + "balance_loss_mlp": 1.07263255, + "epoch": 0.29588303193535975, + "flos": 543683819520.0, + "grad_norm": 0.08897837479493585, + "language_loss": 0.83872563, + "learning_rate": 0.0008258006065209682, + "loss": 0.84962142, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.16955566, + "step": 1538, + "time_per_iteration": 2.749382972717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083685, + "balance_loss_mlp": 1.06642318, + "epoch": 0.29607541362062334, + "flos": 596947345920.0, + "grad_norm": 0.09390517967885302, + "language_loss": 0.80569965, + "learning_rate": 0.0008255642192271657, + "loss": 0.81653649, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.17285156, + "step": 1539, + "time_per_iteration": 2.834967851638794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.07543612, + "epoch": 0.29626779530588687, + "flos": 609877237248.0, + "grad_norm": 0.08140985627423285, + "language_loss": 0.8348605, + "learning_rate": 0.0008253277055381241, + "loss": 0.84579086, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.17602539, + "step": 1540, + "time_per_iteration": 2.8553531169891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109997, + "balance_loss_mlp": 1.08266127, + "epoch": 0.29646017699115046, + "flos": 867430674432.0, + "grad_norm": 0.07492894951417867, + "language_loss": 0.8559624, + "learning_rate": 0.0008250910655456658, + "loss": 0.86696208, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.17321777, + "step": 1541, + "time_per_iteration": 3.141746997833252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121244, + "balance_loss_mlp": 1.10318387, + "epoch": 0.296652558676414, + "flos": 495868594176.0, + "grad_norm": 0.0890193674873045, + "language_loss": 0.83764815, + "learning_rate": 
0.0008248542993416625, + "loss": 0.84886062, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.18054199, + "step": 1542, + "time_per_iteration": 2.634694814682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134671, + "balance_loss_mlp": 1.11682534, + "epoch": 0.2968449403616776, + "flos": 571544957952.0, + "grad_norm": 0.08265783697410327, + "language_loss": 0.83617258, + "learning_rate": 0.0008246174070180352, + "loss": 0.84751928, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.17871094, + "step": 1543, + "time_per_iteration": 2.7335524559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139596, + "balance_loss_mlp": 1.12247741, + "epoch": 0.2970373220469411, + "flos": 794168271360.0, + "grad_norm": 0.09383563779300157, + "language_loss": 0.83888161, + "learning_rate": 0.0008243803886667537, + "loss": 0.85027754, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.17138672, + "step": 1544, + "time_per_iteration": 3.1672377586364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139138, + "balance_loss_mlp": 1.12212706, + "epoch": 0.2972297037322047, + "flos": 661038091776.0, + "grad_norm": 0.09212665263146659, + "language_loss": 0.7881431, + "learning_rate": 0.0008241432443798364, + "loss": 0.79953444, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.17028809, + "step": 1545, + "time_per_iteration": 2.8234944343566895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128999, + "balance_loss_mlp": 1.11242867, + "epoch": 0.29742208541746823, + "flos": 597125385216.0, + "grad_norm": 0.056688876570847646, + "language_loss": 0.85312325, + "learning_rate": 0.0008239059742493512, + "loss": 0.86441326, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.16577148, + "step": 1546, + "time_per_iteration": 2.7027690410614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134853, + "balance_loss_mlp": 
1.11818719, + "epoch": 0.2976144671027318, + "flos": 769882558464.0, + "grad_norm": 0.09085945068897121, + "language_loss": 0.87215161, + "learning_rate": 0.0008236685783674142, + "loss": 0.8835001, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.16674805, + "step": 1547, + "time_per_iteration": 3.0873892307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01183829, + "balance_loss_mlp": 1.1713357, + "epoch": 0.2978068487879954, + "flos": 1484764162560.0, + "grad_norm": 0.05428295829147524, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77405024, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.12451172, + "step": 1548, + "time_per_iteration": 4.899101972579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134552, + "balance_loss_mlp": 1.11795831, + "epoch": 0.29799923047325894, + "flos": 475328632320.0, + "grad_norm": 0.08128040699091903, + "language_loss": 0.81818366, + "learning_rate": 0.0008231934097178955, + "loss": 0.82952917, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.16601562, + "step": 1549, + "time_per_iteration": 2.6477086544036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139569, + "balance_loss_mlp": 1.12291551, + "epoch": 0.2981916121585225, + "flos": 759804198912.0, + "grad_norm": 0.07828537838902122, + "language_loss": 0.85219073, + "learning_rate": 0.0008229556371347903, + "loss": 0.86358643, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.16650391, + "step": 1550, + "time_per_iteration": 3.0261847972869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150744, + "balance_loss_mlp": 1.13455498, + "epoch": 0.29838399384378606, + "flos": 875016152064.0, + "grad_norm": 0.08823136620200941, + "language_loss": 0.78994125, + "learning_rate": 0.0008227177391691874, + "loss": 0.8014487, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.16186523, + "step": 
1551, + "time_per_iteration": 3.180002212524414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136289, + "balance_loss_mlp": 1.11980236, + "epoch": 0.29857637552904964, + "flos": 579661608960.0, + "grad_norm": 0.07744125357066795, + "language_loss": 0.89299029, + "learning_rate": 0.0008224797159134463, + "loss": 0.90435314, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.16491699, + "step": 1552, + "time_per_iteration": 2.739584445953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129762, + "balance_loss_mlp": 1.11325169, + "epoch": 0.2987687572143132, + "flos": 836399748096.0, + "grad_norm": 0.07274609898716765, + "language_loss": 0.83059317, + "learning_rate": 0.0008222415674599765, + "loss": 0.84189081, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.16516113, + "step": 1553, + "time_per_iteration": 3.1217970848083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118455, + "balance_loss_mlp": 1.10149145, + "epoch": 0.29896113889957676, + "flos": 567072741888.0, + "grad_norm": 0.07468995972707258, + "language_loss": 0.82944036, + "learning_rate": 0.0008220032939012349, + "loss": 0.84062493, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.1697998, + "step": 1554, + "time_per_iteration": 2.737661600112915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111265, + "balance_loss_mlp": 1.0940038, + "epoch": 0.29915352058484035, + "flos": 498662853120.0, + "grad_norm": 0.06534643910619843, + "language_loss": 0.87635672, + "learning_rate": 0.0008217648953297277, + "loss": 0.88746935, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.17272949, + "step": 1555, + "time_per_iteration": 2.9030354022979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109118, + "balance_loss_mlp": 1.09171319, + "epoch": 0.2993459022701039, + "flos": 592112083968.0, + "grad_norm": 0.07926146627709543, + "language_loss": 0.78007799, + 
"learning_rate": 0.0008215263718380095, + "loss": 0.79116917, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.17419434, + "step": 1556, + "time_per_iteration": 2.7085471153259277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102748, + "balance_loss_mlp": 1.08450937, + "epoch": 0.29953828395536747, + "flos": 572380079616.0, + "grad_norm": 0.0948368117579541, + "language_loss": 0.84609628, + "learning_rate": 0.0008212877235186833, + "loss": 0.85712373, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.18237305, + "step": 1557, + "time_per_iteration": 2.7050936222076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136692, + "balance_loss_mlp": 1.12467551, + "epoch": 0.299730665640631, + "flos": 1504698425856.0, + "grad_norm": 0.04579697638503373, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78874254, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12011719, + "step": 1558, + "time_per_iteration": 4.93830418586731 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098204, + "balance_loss_mlp": 1.08031082, + "epoch": 0.2999230473258946, + "flos": 513791963136.0, + "grad_norm": 0.08681594057082924, + "language_loss": 0.81027186, + "learning_rate": 0.0008208100527678611, + "loss": 0.8212539, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.17907715, + "step": 1559, + "time_per_iteration": 2.6041250228881836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_mlp": 1.08412552, + "epoch": 0.3001154290111581, + "flos": 834472544256.0, + "grad_norm": 0.11630596930036842, + "language_loss": 0.78128254, + "learning_rate": 0.0008205710305218135, + "loss": 0.79229701, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.17333984, + "step": 1560, + "time_per_iteration": 3.0562148094177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109931, + 
"balance_loss_mlp": 1.08225095, + "epoch": 0.3003078106964217, + "flos": 556776695808.0, + "grad_norm": 0.07630099015555136, + "language_loss": 0.89525402, + "learning_rate": 0.0008203318838190541, + "loss": 0.90624714, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.17077637, + "step": 1561, + "time_per_iteration": 2.7627954483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110369, + "balance_loss_mlp": 1.08669066, + "epoch": 0.30050019238168524, + "flos": 526151033856.0, + "grad_norm": 0.09266250591977641, + "language_loss": 0.84876859, + "learning_rate": 0.0008200926127524281, + "loss": 0.85980552, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.17016602, + "step": 1562, + "time_per_iteration": 2.699997663497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111077, + "balance_loss_mlp": 1.09415245, + "epoch": 0.3006925740669488, + "flos": 577852973568.0, + "grad_norm": 0.08848358123460635, + "language_loss": 0.82834399, + "learning_rate": 0.0008198532174148289, + "loss": 0.83945167, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.16625977, + "step": 1563, + "time_per_iteration": 2.728264570236206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088743, + "balance_loss_mlp": 1.07691729, + "epoch": 0.3008849557522124, + "flos": 1490246595072.0, + "grad_norm": 0.03477061119396021, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81774914, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.11816406, + "step": 1564, + "time_per_iteration": 4.858918905258179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148782, + "balance_loss_mlp": 1.13198543, + "epoch": 0.30107733743747594, + "flos": 509816415744.0, + "grad_norm": 0.1259196892608865, + "language_loss": 0.88896626, + "learning_rate": 0.0008193740542985244, + "loss": 0.9004541, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 
0.16809082, + "step": 1565, + "time_per_iteration": 2.6722562313079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165467, + "balance_loss_mlp": 1.14907598, + "epoch": 0.30126971912273953, + "flos": 587704108032.0, + "grad_norm": 0.1324055806972963, + "language_loss": 0.86720473, + "learning_rate": 0.0008191342867058467, + "loss": 0.8788594, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.16394043, + "step": 1566, + "time_per_iteration": 2.7314035892486572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147033, + "balance_loss_mlp": 1.13058197, + "epoch": 0.30146210080800306, + "flos": 602101610496.0, + "grad_norm": 0.09630003386887155, + "language_loss": 0.83068216, + "learning_rate": 0.0008188943952142509, + "loss": 0.84215248, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.16455078, + "step": 1567, + "time_per_iteration": 2.8423235416412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128472, + "balance_loss_mlp": 1.11148453, + "epoch": 0.30165448249326665, + "flos": 917796054528.0, + "grad_norm": 0.09368409570014515, + "language_loss": 0.82277513, + "learning_rate": 0.0008186543799168711, + "loss": 0.83405983, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.17004395, + "step": 1568, + "time_per_iteration": 3.1569459438323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096028, + "balance_loss_mlp": 1.07919598, + "epoch": 0.3018468641785302, + "flos": 777287798784.0, + "grad_norm": 0.20562474195503389, + "language_loss": 0.88231719, + "learning_rate": 0.0008184142409068892, + "loss": 0.89327747, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.16845703, + "step": 1569, + "time_per_iteration": 3.0334763526916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089793, + "balance_loss_mlp": 1.0729959, + "epoch": 0.30203924586379377, + "flos": 522358295040.0, + "grad_norm": 0.06986495925142319, + 
"language_loss": 0.86445761, + "learning_rate": 0.000818173978277536, + "loss": 0.87535548, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.16809082, + "step": 1570, + "time_per_iteration": 2.6637074947357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085635, + "balance_loss_mlp": 1.06840897, + "epoch": 0.3022316275490573, + "flos": 524559711744.0, + "grad_norm": 0.09310337511128065, + "language_loss": 0.8345744, + "learning_rate": 0.000817933592122089, + "loss": 0.84543073, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.17236328, + "step": 1571, + "time_per_iteration": 2.693112850189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085619, + "balance_loss_mlp": 1.06780863, + "epoch": 0.3024240092343209, + "flos": 479912076288.0, + "grad_norm": 0.10986906736250873, + "language_loss": 0.83327937, + "learning_rate": 0.0008176930825338749, + "loss": 0.84413558, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.17810059, + "step": 1572, + "time_per_iteration": 2.609584331512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086583, + "balance_loss_mlp": 1.06848717, + "epoch": 0.3026163909195845, + "flos": 687206592000.0, + "grad_norm": 0.10627002925019795, + "language_loss": 0.88423979, + "learning_rate": 0.0008174524496062679, + "loss": 0.89510572, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.1809082, + "step": 1573, + "time_per_iteration": 2.9317731857299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085921, + "balance_loss_mlp": 1.06767023, + "epoch": 0.302808772604848, + "flos": 542940102144.0, + "grad_norm": 0.08890838553235277, + "language_loss": 0.85423905, + "learning_rate": 0.0008172116934326894, + "loss": 0.86509824, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.18249512, + "step": 1574, + "time_per_iteration": 2.795232057571411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01085798, + "balance_loss_mlp": 1.06757045, + "epoch": 0.3030011542901116, + "flos": 475091495424.0, + "grad_norm": 0.0994527497506169, + "language_loss": 0.87673843, + "learning_rate": 0.0008169708141066097, + "loss": 0.88759637, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.18212891, + "step": 1575, + "time_per_iteration": 2.587369203567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088984, + "balance_loss_mlp": 1.07053041, + "epoch": 0.30319353597537513, + "flos": 481481003520.0, + "grad_norm": 0.4142555186010625, + "language_loss": 0.90523762, + "learning_rate": 0.0008167298117215465, + "loss": 0.91612744, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.18432617, + "step": 1576, + "time_per_iteration": 2.591120481491089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109689, + "balance_loss_mlp": 1.07822132, + "epoch": 0.3033859176606387, + "flos": 704786365440.0, + "grad_norm": 0.08528414160414997, + "language_loss": 0.87905335, + "learning_rate": 0.0008164886863710649, + "loss": 0.89002216, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.18652344, + "step": 1577, + "time_per_iteration": 2.9462757110595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130352, + "balance_loss_mlp": 1.11145782, + "epoch": 0.30357829934590225, + "flos": 764696360448.0, + "grad_norm": 0.07426584678404557, + "language_loss": 0.85645878, + "learning_rate": 0.0008162474381487783, + "loss": 0.86776227, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.1887207, + "step": 1578, + "time_per_iteration": 3.1258718967437744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01170066, + "balance_loss_mlp": 1.15105188, + "epoch": 0.30377068103116583, + "flos": 532355162112.0, + "grad_norm": 0.10196200235578438, + "language_loss": 0.849518, + "learning_rate": 0.0008160060671483475, + "loss": 0.86121869, + "num_input_tokens_seen": 130362672, + 
"router_z_loss_mlp": 0.19018555, + "step": 1579, + "time_per_iteration": 2.686903953552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193624, + "balance_loss_mlp": 1.17542076, + "epoch": 0.3039630627164294, + "flos": 510191944704.0, + "grad_norm": 0.11175205501845424, + "language_loss": 0.82875144, + "learning_rate": 0.0008157645734634809, + "loss": 0.84068769, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.18212891, + "step": 1580, + "time_per_iteration": 2.623169183731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146657, + "balance_loss_mlp": 1.13449764, + "epoch": 0.30415544440169295, + "flos": 1506000854016.0, + "grad_norm": 0.05359937724929427, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.78043151, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.12158203, + "step": 1581, + "time_per_iteration": 4.941681623458862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126623, + "balance_loss_mlp": 1.11465442, + "epoch": 0.30434782608695654, + "flos": 1458736625664.0, + "grad_norm": 0.04979857074148905, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74341118, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.11962891, + "step": 1582, + "time_per_iteration": 4.878013372421265 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01233201, + "balance_loss_mlp": 1.21421146, + "epoch": 0.3045402077722201, + "flos": 482555833344.0, + "grad_norm": 0.08528831092857085, + "language_loss": 0.8396011, + "learning_rate": 0.000815039357240067, + "loss": 0.85193312, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.18969727, + "step": 1583, + "time_per_iteration": 2.643695116043091 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228928, + "balance_loss_mlp": 1.21003366, + "epoch": 0.30473258945748366, + "flos": 543501010944.0, + "grad_norm": 
0.10406683839721904, + "language_loss": 0.8531003, + "learning_rate": 0.0008147973737554952, + "loss": 0.86538959, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.18884277, + "step": 1584, + "time_per_iteration": 2.780329942703247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01201707, + "balance_loss_mlp": 1.18393278, + "epoch": 0.3049249711427472, + "flos": 567055489536.0, + "grad_norm": 0.07761853967257432, + "language_loss": 0.86104375, + "learning_rate": 0.000814555268055744, + "loss": 0.87306082, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.17785645, + "step": 1585, + "time_per_iteration": 2.6921656131744385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196281, + "balance_loss_mlp": 1.17799401, + "epoch": 0.3051173528280108, + "flos": 528233882112.0, + "grad_norm": 0.07850387771459345, + "language_loss": 0.86948889, + "learning_rate": 0.0008143130402348073, + "loss": 0.88145167, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.18273926, + "step": 1586, + "time_per_iteration": 2.6515746116638184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165884, + "balance_loss_mlp": 1.14803839, + "epoch": 0.3053097345132743, + "flos": 586396910592.0, + "grad_norm": 0.0685053805978033, + "language_loss": 0.79063147, + "learning_rate": 0.0008140706903867265, + "loss": 0.80229032, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.17858887, + "step": 1587, + "time_per_iteration": 2.823451042175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158917, + "balance_loss_mlp": 1.14067745, + "epoch": 0.3055021161985379, + "flos": 607087747584.0, + "grad_norm": 0.09375856425609289, + "language_loss": 0.90278405, + "learning_rate": 0.0008138282186055897, + "loss": 0.91437322, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.18261719, + "step": 1588, + "time_per_iteration": 2.7146568298339844 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01147472, + "balance_loss_mlp": 1.12988853, + "epoch": 0.3056944978838015, + "flos": 573867514368.0, + "grad_norm": 0.0770581210118419, + "language_loss": 0.82476223, + "learning_rate": 0.0008135856249855331, + "loss": 0.83623695, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.17614746, + "step": 1589, + "time_per_iteration": 2.71938157081604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141231, + "balance_loss_mlp": 1.12317085, + "epoch": 0.305886879569065, + "flos": 633925813248.0, + "grad_norm": 0.10579892777067937, + "language_loss": 0.89201659, + "learning_rate": 0.0008133429096207398, + "loss": 0.90342891, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.18066406, + "step": 1590, + "time_per_iteration": 2.828059434890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01326323, + "balance_loss_mlp": 1.31087315, + "epoch": 0.3060792612543286, + "flos": 1369005981696.0, + "grad_norm": 0.09384482719125187, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76638579, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.15429688, + "step": 1591, + "time_per_iteration": 5.056639909744263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158477, + "balance_loss_mlp": 1.13997602, + "epoch": 0.30627164293959214, + "flos": 518555644416.0, + "grad_norm": 0.07055782584393462, + "language_loss": 0.86496353, + "learning_rate": 0.0008128571140339123, + "loss": 0.87654829, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.18505859, + "step": 1592, + "time_per_iteration": 2.6639931201934814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148416, + "balance_loss_mlp": 1.12930679, + "epoch": 0.3064640246248557, + "flos": 455589287424.0, + "grad_norm": 0.0722691659040447, + "language_loss": 0.87266612, + "learning_rate": 0.0008126140340004805, + "loss": 0.88415021, + "num_input_tokens_seen": 
131841216, + "router_z_loss_mlp": 0.19104004, + "step": 1593, + "time_per_iteration": 2.574216604232788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153692, + "balance_loss_mlp": 1.1345824, + "epoch": 0.30665640631011926, + "flos": 850095378432.0, + "grad_norm": 0.07242693719108233, + "language_loss": 0.81765437, + "learning_rate": 0.0008123708325995172, + "loss": 0.82919127, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.19104004, + "step": 1594, + "time_per_iteration": 3.2430498600006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160899, + "balance_loss_mlp": 1.14182544, + "epoch": 0.30684878799538284, + "flos": 758319335424.0, + "grad_norm": 0.08669645453401467, + "language_loss": 0.79659396, + "learning_rate": 0.0008121275099254414, + "loss": 0.80820298, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.19067383, + "step": 1595, + "time_per_iteration": 2.992558479309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116479, + "balance_loss_mlp": 1.14517975, + "epoch": 0.3070411696806464, + "flos": 517574790144.0, + "grad_norm": 0.06321681758762837, + "language_loss": 0.88210988, + "learning_rate": 0.0008118840660727194, + "loss": 0.8937577, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.19592285, + "step": 1596, + "time_per_iteration": 2.655043840408325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116269, + "balance_loss_mlp": 1.14316404, + "epoch": 0.30723355136590996, + "flos": 844264207872.0, + "grad_norm": 0.06781928625830316, + "language_loss": 0.87805635, + "learning_rate": 0.0008116405011358644, + "loss": 0.88968325, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.19519043, + "step": 1597, + "time_per_iteration": 3.180513620376587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172311, + "balance_loss_mlp": 1.15260601, + "epoch": 0.30742593305117355, + "flos": 466139722752.0, + "grad_norm": 
0.0749329830796044, + "language_loss": 0.79566741, + "learning_rate": 0.0008113968152094369, + "loss": 0.80739057, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.19702148, + "step": 1598, + "time_per_iteration": 2.6038942337036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164591, + "balance_loss_mlp": 1.14439654, + "epoch": 0.3076183147364371, + "flos": 686591354880.0, + "grad_norm": 0.09148494515579969, + "language_loss": 0.82006347, + "learning_rate": 0.0008111530083880438, + "loss": 0.83170938, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.2019043, + "step": 1599, + "time_per_iteration": 2.9283370971679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155155, + "balance_loss_mlp": 1.13517594, + "epoch": 0.30781069642170067, + "flos": 614018340864.0, + "grad_norm": 0.08461014219336162, + "language_loss": 0.86254573, + "learning_rate": 0.0008109090807663399, + "loss": 0.87409735, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.19970703, + "step": 1600, + "time_per_iteration": 2.825857639312744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137804, + "balance_loss_mlp": 1.11677539, + "epoch": 0.3080030781069642, + "flos": 590318129664.0, + "grad_norm": 0.062223790852464995, + "language_loss": 0.88488859, + "learning_rate": 0.0008106650324390257, + "loss": 0.89626658, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.21032715, + "step": 1601, + "time_per_iteration": 2.8589255809783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112197, + "balance_loss_mlp": 1.10128665, + "epoch": 0.3081954597922278, + "flos": 562620349440.0, + "grad_norm": 0.07165476987233708, + "language_loss": 0.81206429, + "learning_rate": 0.0008104208635008493, + "loss": 0.82328397, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.20690918, + "step": 1602, + "time_per_iteration": 2.6751368045806885 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01109456, + "balance_loss_mlp": 1.0886662, + "epoch": 0.3083878414774913, + "flos": 447830913024.0, + "grad_norm": 0.08196336802935668, + "language_loss": 0.81529546, + "learning_rate": 0.0008101765740466058, + "loss": 0.82638997, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.20788574, + "step": 1603, + "time_per_iteration": 2.5513291358947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103563, + "balance_loss_mlp": 1.08332109, + "epoch": 0.3085802231627549, + "flos": 493546037760.0, + "grad_norm": 0.0890222565523069, + "language_loss": 0.83796382, + "learning_rate": 0.0008099321641711364, + "loss": 0.8489995, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.20227051, + "step": 1604, + "time_per_iteration": 2.6779870986938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104404, + "balance_loss_mlp": 1.08353007, + "epoch": 0.3087726048480185, + "flos": 487687703040.0, + "grad_norm": 0.07300879059514653, + "language_loss": 0.83213902, + "learning_rate": 0.0008096876339693295, + "loss": 0.84318304, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.2088623, + "step": 1605, + "time_per_iteration": 2.667900800704956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109091, + "balance_loss_mlp": 1.07006013, + "epoch": 0.308964986533282, + "flos": 730589248512.0, + "grad_norm": 0.08337461956862639, + "language_loss": 0.81168187, + "learning_rate": 0.0008094429835361206, + "loss": 0.82259107, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.20861816, + "step": 1606, + "time_per_iteration": 3.0076494216918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081794, + "balance_loss_mlp": 1.06069374, + "epoch": 0.3091573682185456, + "flos": 605407592448.0, + "grad_norm": 0.10542585380202701, + "language_loss": 0.85789704, + "learning_rate": 0.0008091982129664908, + "loss": 0.86871505, + "num_input_tokens_seen": 
132895936, + "router_z_loss_mlp": 0.21105957, + "step": 1607, + "time_per_iteration": 2.730372428894043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087916, + "balance_loss_mlp": 1.06643414, + "epoch": 0.30934974990380915, + "flos": 460325804544.0, + "grad_norm": 0.07933352528165237, + "language_loss": 0.83225489, + "learning_rate": 0.0008089533223554687, + "loss": 0.84313411, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.21484375, + "step": 1608, + "time_per_iteration": 2.7049362659454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090604, + "balance_loss_mlp": 1.06942058, + "epoch": 0.30954213158907273, + "flos": 553426297344.0, + "grad_norm": 0.08271353671646894, + "language_loss": 0.85293424, + "learning_rate": 0.0008087083117981294, + "loss": 0.86384022, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.21179199, + "step": 1609, + "time_per_iteration": 2.8826427459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101999, + "balance_loss_mlp": 1.08043373, + "epoch": 0.30973451327433627, + "flos": 553043427840.0, + "grad_norm": 0.0996721022061816, + "language_loss": 0.88292408, + "learning_rate": 0.0008084631813895943, + "loss": 0.89394403, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.21569824, + "step": 1610, + "time_per_iteration": 2.7805559635162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121386, + "balance_loss_mlp": 1.10027432, + "epoch": 0.30992689495959985, + "flos": 565696535040.0, + "grad_norm": 0.07842877021383077, + "language_loss": 0.83548594, + "learning_rate": 0.0008082179312250315, + "loss": 0.84669983, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.21118164, + "step": 1611, + "time_per_iteration": 2.676135540008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388019, + "balance_loss_mlp": 1.36951745, + "epoch": 0.3101192766448634, + "flos": 1442406776832.0, + 
"grad_norm": 0.08809519842771894, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.81243861, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.18457031, + "step": 1612, + "time_per_iteration": 4.860031843185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0126274, + "balance_loss_mlp": 1.24729049, + "epoch": 0.31031165833012697, + "flos": 1531892570112.0, + "grad_norm": 0.05130460412725523, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77892077, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.15429688, + "step": 1613, + "time_per_iteration": 5.034562110900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01199222, + "balance_loss_mlp": 1.18011272, + "epoch": 0.31050404001539056, + "flos": 991952676864.0, + "grad_norm": 0.0938643891544465, + "language_loss": 0.82239884, + "learning_rate": 0.0008074814631475545, + "loss": 0.83439106, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.19091797, + "step": 1614, + "time_per_iteration": 3.336702585220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212604, + "balance_loss_mlp": 1.19325638, + "epoch": 0.3106964217006541, + "flos": 445992542208.0, + "grad_norm": 0.08076281903906762, + "language_loss": 0.79283953, + "learning_rate": 0.0008072357349114907, + "loss": 0.80496556, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.19335938, + "step": 1615, + "time_per_iteration": 2.6835010051727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230065, + "balance_loss_mlp": 1.21150458, + "epoch": 0.3108888033859177, + "flos": 510505804800.0, + "grad_norm": 0.10215362910815345, + "language_loss": 0.88464314, + "learning_rate": 0.0008069898873959363, + "loss": 0.89694381, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.1854248, + "step": 1616, + "time_per_iteration": 2.669456958770752 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01213455, + "balance_loss_mlp": 1.19514489, + "epoch": 0.3110811850711812, + "flos": 520732468224.0, + "grad_norm": 0.07300003813068634, + "language_loss": 0.85508597, + "learning_rate": 0.0008067439206963375, + "loss": 0.86722052, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.18310547, + "step": 1617, + "time_per_iteration": 2.641707420349121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01202163, + "balance_loss_mlp": 1.18378067, + "epoch": 0.3112735667564448, + "flos": 686413315584.0, + "grad_norm": 0.08997138772617237, + "language_loss": 0.86023128, + "learning_rate": 0.0008064978349081873, + "loss": 0.87225294, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.18395996, + "step": 1618, + "time_per_iteration": 2.998687982559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181054, + "balance_loss_mlp": 1.1626246, + "epoch": 0.31146594844170833, + "flos": 533061803520.0, + "grad_norm": 0.07073814720845698, + "language_loss": 0.8619715, + "learning_rate": 0.0008062516301270245, + "loss": 0.87378204, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.1842041, + "step": 1619, + "time_per_iteration": 2.72948956489563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187406, + "balance_loss_mlp": 1.16931009, + "epoch": 0.3116583301269719, + "flos": 679517227008.0, + "grad_norm": 0.06466481546389395, + "language_loss": 0.88310599, + "learning_rate": 0.0008060053064484343, + "loss": 0.89498007, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.18115234, + "step": 1620, + "time_per_iteration": 2.9406392574310303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188397, + "balance_loss_mlp": 1.17067063, + "epoch": 0.31185071181223545, + "flos": 586149861888.0, + "grad_norm": 0.09059197010434686, + "language_loss": 0.84835637, + "learning_rate": 0.0008057588639680482, + "loss": 0.86024034, + 
"num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.17724609, + "step": 1621, + "time_per_iteration": 2.7712435722351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172579, + "balance_loss_mlp": 1.15451908, + "epoch": 0.31204309349749904, + "flos": 725403050496.0, + "grad_norm": 0.0998699448976919, + "language_loss": 0.83181798, + "learning_rate": 0.0008055123027815434, + "loss": 0.84354383, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.18078613, + "step": 1622, + "time_per_iteration": 2.918195962905884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158801, + "balance_loss_mlp": 1.14063358, + "epoch": 0.3122354751827626, + "flos": 576825131520.0, + "grad_norm": 0.08307305946300769, + "language_loss": 0.8472932, + "learning_rate": 0.0008052656229846436, + "loss": 0.85888124, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.18164062, + "step": 1623, + "time_per_iteration": 2.6911518573760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141486, + "balance_loss_mlp": 1.12317586, + "epoch": 0.31242785686802615, + "flos": 575943022080.0, + "grad_norm": 0.13857337515277973, + "language_loss": 0.90054119, + "learning_rate": 0.0008050188246731182, + "loss": 0.91195607, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.18322754, + "step": 1624, + "time_per_iteration": 2.682352066040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132158, + "balance_loss_mlp": 1.11350143, + "epoch": 0.31262023855328974, + "flos": 736830452736.0, + "grad_norm": 0.07575228871239431, + "language_loss": 0.81929862, + "learning_rate": 0.0008047719079427834, + "loss": 0.83062017, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.18664551, + "step": 1625, + "time_per_iteration": 2.9942879676818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01230131, + "balance_loss_mlp": 1.21601677, + "epoch": 0.3128126202385533, + "flos": 
1559232073728.0, + "grad_norm": 0.048676192852424666, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75581837, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.14160156, + "step": 1626, + "time_per_iteration": 4.848233938217163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108724, + "balance_loss_mlp": 1.08925653, + "epoch": 0.31300500192381686, + "flos": 514921121280.0, + "grad_norm": 0.0694146578244244, + "language_loss": 0.86078912, + "learning_rate": 0.0008042777196091757, + "loss": 0.87187636, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.19458008, + "step": 1627, + "time_per_iteration": 2.701900005340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116209, + "balance_loss_mlp": 1.09631276, + "epoch": 0.3131973836090804, + "flos": 526627878912.0, + "grad_norm": 0.08749628678496815, + "language_loss": 0.81888652, + "learning_rate": 0.0008040304481977643, + "loss": 0.83004862, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.19885254, + "step": 1628, + "time_per_iteration": 2.696526527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138715, + "balance_loss_mlp": 1.11946249, + "epoch": 0.313389765294344, + "flos": 822820114944.0, + "grad_norm": 0.07447099765210985, + "language_loss": 0.8675555, + "learning_rate": 0.0008037830587512649, + "loss": 0.87894267, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.19250488, + "step": 1629, + "time_per_iteration": 3.0616016387939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134253, + "balance_loss_mlp": 1.11413062, + "epoch": 0.31358214697960757, + "flos": 393823669248.0, + "grad_norm": 0.09771619875867958, + "language_loss": 0.78561771, + "learning_rate": 0.0008035355513657224, + "loss": 0.79696023, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.20117188, + "step": 1630, + "time_per_iteration": 2.4754045009613037 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137016, + "balance_loss_mlp": 1.11708379, + "epoch": 0.3137745286648711, + "flos": 571908003840.0, + "grad_norm": 0.08006054346576318, + "language_loss": 0.9267844, + "learning_rate": 0.0008032879261372279, + "loss": 0.93815458, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.19921875, + "step": 1631, + "time_per_iteration": 2.802116870880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162193, + "balance_loss_mlp": 1.14845991, + "epoch": 0.3139669103501347, + "flos": 1498415376384.0, + "grad_norm": 0.027777304949473513, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80798036, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.13769531, + "step": 1632, + "time_per_iteration": 5.508919715881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119807, + "balance_loss_mlp": 1.10029221, + "epoch": 0.3141592920353982, + "flos": 525343076352.0, + "grad_norm": 0.0647776963699187, + "language_loss": 0.86985779, + "learning_rate": 0.0008027923225359748, + "loss": 0.88105589, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.19506836, + "step": 1633, + "time_per_iteration": 2.600407600402832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108986, + "balance_loss_mlp": 1.08867252, + "epoch": 0.3143516737206618, + "flos": 593268406272.0, + "grad_norm": 0.07494658582155435, + "language_loss": 0.87969911, + "learning_rate": 0.0008025443443556267, + "loss": 0.89078891, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.20300293, + "step": 1634, + "time_per_iteration": 2.721635103225708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103961, + "balance_loss_mlp": 1.08468509, + "epoch": 0.31454405540592534, + "flos": 648362589696.0, + "grad_norm": 0.09628820684288855, + "language_loss": 0.88015246, + "learning_rate": 0.000802296248717147, + "loss": 0.89119208, 
+ "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.19262695, + "step": 1635, + "time_per_iteration": 2.94401478767395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090786, + "balance_loss_mlp": 1.07087779, + "epoch": 0.3147364370911889, + "flos": 642847850496.0, + "grad_norm": 0.07971253455476307, + "language_loss": 0.78918988, + "learning_rate": 0.0008020480357168554, + "loss": 0.8000977, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.19897461, + "step": 1636, + "time_per_iteration": 2.863992691040039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089663, + "balance_loss_mlp": 1.07011271, + "epoch": 0.31492881877645246, + "flos": 471849753600.0, + "grad_norm": 0.07737806088204505, + "language_loss": 0.87917638, + "learning_rate": 0.0008017997054511165, + "loss": 0.890073, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.1953125, + "step": 1637, + "time_per_iteration": 2.586543083190918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087265, + "balance_loss_mlp": 1.06765532, + "epoch": 0.31512120046171604, + "flos": 629433773568.0, + "grad_norm": 0.08038806705740831, + "language_loss": 0.85134554, + "learning_rate": 0.0008015512580163407, + "loss": 0.86221826, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.19592285, + "step": 1638, + "time_per_iteration": 2.8016490936279297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084302, + "balance_loss_mlp": 1.06364322, + "epoch": 0.31531358214697963, + "flos": 703778347008.0, + "grad_norm": 0.07403915674476273, + "language_loss": 0.80143899, + "learning_rate": 0.0008013026935089838, + "loss": 0.81228203, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.20666504, + "step": 1639, + "time_per_iteration": 2.906219244003296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086238, + "balance_loss_mlp": 1.06543589, + "epoch": 0.31550596383224316, + "flos": 
572545635840.0, + "grad_norm": 0.08080644571808258, + "language_loss": 0.83962494, + "learning_rate": 0.0008010540120255472, + "loss": 0.85048735, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.20788574, + "step": 1640, + "time_per_iteration": 2.6874494552612305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093894, + "balance_loss_mlp": 1.07238901, + "epoch": 0.31569834551750675, + "flos": 658340006400.0, + "grad_norm": 0.10412897550370145, + "language_loss": 0.85903674, + "learning_rate": 0.0008008052136625774, + "loss": 0.86997569, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.21508789, + "step": 1641, + "time_per_iteration": 2.806689977645874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101865, + "balance_loss_mlp": 1.08080053, + "epoch": 0.3158907272027703, + "flos": 566282036736.0, + "grad_norm": 0.07569050828740802, + "language_loss": 0.86666101, + "learning_rate": 0.0008005562985166666, + "loss": 0.87767971, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.21069336, + "step": 1642, + "time_per_iteration": 2.7800753116607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109644, + "balance_loss_mlp": 1.08823395, + "epoch": 0.31608310888803387, + "flos": 536891618304.0, + "grad_norm": 0.05889143992207802, + "language_loss": 0.85174221, + "learning_rate": 0.0008003072666844524, + "loss": 0.86283863, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.21411133, + "step": 1643, + "time_per_iteration": 2.722987651824951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122613, + "balance_loss_mlp": 1.10185909, + "epoch": 0.3162754905732974, + "flos": 486669772800.0, + "grad_norm": 0.0837642836105996, + "language_loss": 0.82220256, + "learning_rate": 0.0008000581182626173, + "loss": 0.83342868, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.20751953, + "step": 1644, + "time_per_iteration": 2.5624425411224365 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143306, + "balance_loss_mlp": 1.12279046, + "epoch": 0.316467872258561, + "flos": 530052429312.0, + "grad_norm": 0.21399278605623545, + "language_loss": 0.85377562, + "learning_rate": 0.0007998088533478894, + "loss": 0.86520875, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.2052002, + "step": 1645, + "time_per_iteration": 2.657808542251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118356, + "balance_loss_mlp": 1.09847164, + "epoch": 0.3166602539438245, + "flos": 443440189440.0, + "grad_norm": 0.1165927047614104, + "language_loss": 0.83989012, + "learning_rate": 0.000799559472037042, + "loss": 0.85107368, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.19873047, + "step": 1646, + "time_per_iteration": 2.5764071941375732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101642, + "balance_loss_mlp": 1.08161449, + "epoch": 0.3168526356290881, + "flos": 645830060544.0, + "grad_norm": 0.06134983371250154, + "language_loss": 0.87497842, + "learning_rate": 0.0007993099744268932, + "loss": 0.88599485, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.20031738, + "step": 1647, + "time_per_iteration": 2.9123756885528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094967, + "balance_loss_mlp": 1.07502329, + "epoch": 0.3170450173143517, + "flos": 586162344960.0, + "grad_norm": 0.08774031682584008, + "language_loss": 0.87840933, + "learning_rate": 0.000799060360614307, + "loss": 0.889359, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.19934082, + "step": 1648, + "time_per_iteration": 2.7346584796905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089439, + "balance_loss_mlp": 1.06954336, + "epoch": 0.3172373989996152, + "flos": 827124203520.0, + "grad_norm": 0.07558157708493889, + "language_loss": 0.8330996, + "learning_rate": 0.0007988106306961917, + "loss": 0.84399396, 
+ "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.19885254, + "step": 1649, + "time_per_iteration": 3.1326329708099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091589, + "balance_loss_mlp": 1.07182384, + "epoch": 0.3174297806848788, + "flos": 527408672256.0, + "grad_norm": 0.0875083493892423, + "language_loss": 0.84519339, + "learning_rate": 0.0007985607847695014, + "loss": 0.85610926, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.19750977, + "step": 1650, + "time_per_iteration": 2.689587354660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087502, + "balance_loss_mlp": 1.06813097, + "epoch": 0.31762216237014235, + "flos": 713179800576.0, + "grad_norm": 0.10331276722207645, + "language_loss": 0.82647395, + "learning_rate": 0.0007983108229312345, + "loss": 0.83734906, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.19348145, + "step": 1651, + "time_per_iteration": 2.935060501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094077, + "balance_loss_mlp": 1.07493234, + "epoch": 0.31781454405540593, + "flos": 483813471744.0, + "grad_norm": 0.08920057207213788, + "language_loss": 0.86297011, + "learning_rate": 0.0007980607452784351, + "loss": 0.8739109, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.19128418, + "step": 1652, + "time_per_iteration": 2.5893616676330566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090057, + "balance_loss_mlp": 1.07070947, + "epoch": 0.31800692574066947, + "flos": 548746679808.0, + "grad_norm": 0.10003790987475829, + "language_loss": 0.90127802, + "learning_rate": 0.0007978105519081919, + "loss": 0.91217864, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.1932373, + "step": 1653, + "time_per_iteration": 2.7026524543762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091886, + "balance_loss_mlp": 1.07306278, + "epoch": 0.31819930742593305, + "flos": 
516895312896.0, + "grad_norm": 0.08393617058492224, + "language_loss": 0.87581307, + "learning_rate": 0.0007975602429176385, + "loss": 0.88673192, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.18811035, + "step": 1654, + "time_per_iteration": 2.652863025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110285, + "balance_loss_mlp": 1.08389616, + "epoch": 0.31839168911119664, + "flos": 455991980544.0, + "grad_norm": 0.08283763038644905, + "language_loss": 0.8141948, + "learning_rate": 0.0007973098184039536, + "loss": 0.82522333, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.18933105, + "step": 1655, + "time_per_iteration": 2.658590316772461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113313, + "balance_loss_mlp": 1.09477568, + "epoch": 0.3185840707964602, + "flos": 626033816064.0, + "grad_norm": 0.08159903981201219, + "language_loss": 0.86618698, + "learning_rate": 0.0007970592784643602, + "loss": 0.87732017, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.18518066, + "step": 1656, + "time_per_iteration": 2.892390251159668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138107, + "balance_loss_mlp": 1.11967695, + "epoch": 0.31877645248172376, + "flos": 567478006272.0, + "grad_norm": 0.07828329710087445, + "language_loss": 0.84808218, + "learning_rate": 0.0007968086231961272, + "loss": 0.85946327, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.18432617, + "step": 1657, + "time_per_iteration": 2.659250497817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169742, + "balance_loss_mlp": 1.15010786, + "epoch": 0.3189688341669873, + "flos": 489580402176.0, + "grad_norm": 0.1537731911276923, + "language_loss": 0.8331663, + "learning_rate": 0.0007965578526965671, + "loss": 0.84486371, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.19616699, + "step": 1658, + "time_per_iteration": 2.6129345893859863 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115739, + "balance_loss_mlp": 1.13819742, + "epoch": 0.3191612158522509, + "flos": 576234487296.0, + "grad_norm": 0.07993574913147765, + "language_loss": 0.86468869, + "learning_rate": 0.0007963069670630377, + "loss": 0.87626261, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.19189453, + "step": 1659, + "time_per_iteration": 2.735495090484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150627, + "balance_loss_mlp": 1.13118374, + "epoch": 0.3193535975375144, + "flos": 538132004352.0, + "grad_norm": 0.07695546581371572, + "language_loss": 0.87941194, + "learning_rate": 0.0007960559663929416, + "loss": 0.8909182, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.19421387, + "step": 1660, + "time_per_iteration": 2.6464481353759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144507, + "balance_loss_mlp": 1.12452734, + "epoch": 0.319545979222778, + "flos": 734288011776.0, + "grad_norm": 0.0701260521045673, + "language_loss": 0.87574112, + "learning_rate": 0.0007958048507837259, + "loss": 0.88718617, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.19995117, + "step": 1661, + "time_per_iteration": 2.964620590209961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135721, + "balance_loss_mlp": 1.11478782, + "epoch": 0.31973836090804153, + "flos": 764461794816.0, + "grad_norm": 0.08820049354030167, + "language_loss": 0.87464488, + "learning_rate": 0.0007955536203328822, + "loss": 0.88600206, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.20947266, + "step": 1662, + "time_per_iteration": 2.9402856826782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128807, + "balance_loss_mlp": 1.10893452, + "epoch": 0.3199307425933051, + "flos": 560549611008.0, + "grad_norm": 0.0703581314218412, + "language_loss": 0.83491433, + "learning_rate": 0.0007953022751379469, + "loss": 0.84620237, + 
"num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.1986084, + "step": 1663, + "time_per_iteration": 2.8694913387298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133101, + "balance_loss_mlp": 1.11183429, + "epoch": 0.3201231242785687, + "flos": 751349094912.0, + "grad_norm": 0.07762769933283196, + "language_loss": 0.81855732, + "learning_rate": 0.000795050815296501, + "loss": 0.82988834, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.21264648, + "step": 1664, + "time_per_iteration": 2.9839534759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133942, + "balance_loss_mlp": 1.11387873, + "epoch": 0.32031550596383224, + "flos": 496402338816.0, + "grad_norm": 0.06538130148842129, + "language_loss": 0.92802906, + "learning_rate": 0.0007947992409061695, + "loss": 0.93936849, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.20068359, + "step": 1665, + "time_per_iteration": 2.600677013397217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128395, + "balance_loss_mlp": 1.10815299, + "epoch": 0.3205078876490958, + "flos": 731609750016.0, + "grad_norm": 0.07570782620206934, + "language_loss": 0.86083347, + "learning_rate": 0.0007945475520646226, + "loss": 0.8721174, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.20227051, + "step": 1666, + "time_per_iteration": 2.960444211959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126888, + "balance_loss_mlp": 1.10798109, + "epoch": 0.32070026933435936, + "flos": 549436068864.0, + "grad_norm": 0.08296696017450861, + "language_loss": 0.84656757, + "learning_rate": 0.0007942957488695743, + "loss": 0.85783648, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.18908691, + "step": 1667, + "time_per_iteration": 2.671600341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131636, + "balance_loss_mlp": 1.11284864, + "epoch": 0.32089265101962294, + "flos": 
745295468544.0, + "grad_norm": 0.06557982969248469, + "language_loss": 0.80884814, + "learning_rate": 0.0007940438314187833, + "loss": 0.82016456, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.18774414, + "step": 1668, + "time_per_iteration": 3.0618937015533447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129602, + "balance_loss_mlp": 1.11102939, + "epoch": 0.3210850327048865, + "flos": 494188439040.0, + "grad_norm": 0.08496063360517363, + "language_loss": 0.80308306, + "learning_rate": 0.0007937917998100529, + "loss": 0.8143791, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.18566895, + "step": 1669, + "time_per_iteration": 2.6219253540039062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139247, + "balance_loss_mlp": 1.12098432, + "epoch": 0.32127741439015006, + "flos": 530640502272.0, + "grad_norm": 0.07361411804364891, + "language_loss": 0.78932178, + "learning_rate": 0.0007935396541412302, + "loss": 0.80071419, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.18273926, + "step": 1670, + "time_per_iteration": 2.6380372047424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148818, + "balance_loss_mlp": 1.13088846, + "epoch": 0.3214697960754136, + "flos": 501203096064.0, + "grad_norm": 0.07283292072888313, + "language_loss": 0.85630834, + "learning_rate": 0.0007932873945102068, + "loss": 0.86779654, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.17932129, + "step": 1671, + "time_per_iteration": 2.6828458309173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171107, + "balance_loss_mlp": 1.15642071, + "epoch": 0.3216621777606772, + "flos": 1383341815296.0, + "grad_norm": 0.02887484158654099, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76932883, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.14648438, + "step": 1672, + "time_per_iteration": 4.8265416622161865 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160939, + "balance_loss_mlp": 1.14286733, + "epoch": 0.32185455944594077, + "flos": 571535046144.0, + "grad_norm": 0.07500648032395062, + "language_loss": 0.86484933, + "learning_rate": 0.0007927825337533461, + "loss": 0.87645876, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.18078613, + "step": 1673, + "time_per_iteration": 2.7402546405792236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155615, + "balance_loss_mlp": 1.1377933, + "epoch": 0.3220469411312043, + "flos": 543908846592.0, + "grad_norm": 0.10786589074132553, + "language_loss": 0.84594876, + "learning_rate": 0.0007925299328235131, + "loss": 0.8575049, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.17822266, + "step": 1674, + "time_per_iteration": 2.663360118865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149086, + "balance_loss_mlp": 1.13095438, + "epoch": 0.3222393228164679, + "flos": 491139417600.0, + "grad_norm": 0.09107412637612472, + "language_loss": 0.84947217, + "learning_rate": 0.000792277218323488, + "loss": 0.86096299, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.18139648, + "step": 1675, + "time_per_iteration": 2.608579158782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136338, + "balance_loss_mlp": 1.11837292, + "epoch": 0.3224317045017314, + "flos": 490388359680.0, + "grad_norm": 0.07405590971136047, + "language_loss": 0.84631819, + "learning_rate": 0.0007920243903513833, + "loss": 0.85768151, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.17956543, + "step": 1676, + "time_per_iteration": 2.598543882369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128075, + "balance_loss_mlp": 1.10991931, + "epoch": 0.322624086186995, + "flos": 575777465856.0, + "grad_norm": 0.08030295134522303, + "language_loss": 0.83944809, + "learning_rate": 0.0007917714490053556, + "loss": 
0.85072881, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.18164062, + "step": 1677, + "time_per_iteration": 2.6944823265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126784, + "balance_loss_mlp": 1.10863996, + "epoch": 0.32281646787225854, + "flos": 629292810240.0, + "grad_norm": 0.06747924585348261, + "language_loss": 0.86233467, + "learning_rate": 0.0007915183943836055, + "loss": 0.87360251, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.18164062, + "step": 1678, + "time_per_iteration": 2.9165165424346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120975, + "balance_loss_mlp": 1.10280752, + "epoch": 0.3230088495575221, + "flos": 781389255168.0, + "grad_norm": 0.11051827421234449, + "language_loss": 0.84204686, + "learning_rate": 0.0007912652265843773, + "loss": 0.85325664, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.18164062, + "step": 1679, + "time_per_iteration": 3.141361713409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108875, + "balance_loss_mlp": 1.09056485, + "epoch": 0.3232012312427857, + "flos": 536110824960.0, + "grad_norm": 0.06834343380772315, + "language_loss": 0.81678128, + "learning_rate": 0.0007910119457059597, + "loss": 0.82787001, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.1829834, + "step": 1680, + "time_per_iteration": 2.7235679626464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097161, + "balance_loss_mlp": 1.07836151, + "epoch": 0.32339361292804925, + "flos": 704857946112.0, + "grad_norm": 0.08108919878534793, + "language_loss": 0.80109823, + "learning_rate": 0.0007907585518466849, + "loss": 0.81206989, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.18798828, + "step": 1681, + "time_per_iteration": 2.9778435230255127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096448, + "balance_loss_mlp": 1.07823253, + "epoch": 
0.32358599461331283, + "flos": 452330293248.0, + "grad_norm": 0.07179806444318433, + "language_loss": 0.89356047, + "learning_rate": 0.000790505045104929, + "loss": 0.90452492, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.18200684, + "step": 1682, + "time_per_iteration": 2.522502899169922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092596, + "balance_loss_mlp": 1.07453537, + "epoch": 0.32377837629857636, + "flos": 600895729152.0, + "grad_norm": 0.07276753556485034, + "language_loss": 0.86845744, + "learning_rate": 0.0007902514255791125, + "loss": 0.87938344, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.18066406, + "step": 1683, + "time_per_iteration": 2.7951602935791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094831, + "balance_loss_mlp": 1.07612705, + "epoch": 0.32397075798383995, + "flos": 807523250688.0, + "grad_norm": 0.14328213003802046, + "language_loss": 0.87945193, + "learning_rate": 0.0007899976933676986, + "loss": 0.89040023, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.18701172, + "step": 1684, + "time_per_iteration": 3.0410313606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095322, + "balance_loss_mlp": 1.0759027, + "epoch": 0.3241631396691035, + "flos": 601689005568.0, + "grad_norm": 0.09505530250353386, + "language_loss": 0.8717491, + "learning_rate": 0.0007897438485691955, + "loss": 0.88270235, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.19396973, + "step": 1685, + "time_per_iteration": 2.717643976211548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109744, + "balance_loss_mlp": 1.09030128, + "epoch": 0.32435552135436707, + "flos": 474219297792.0, + "grad_norm": 0.0737580177172555, + "language_loss": 0.82153177, + "learning_rate": 0.0007894898912821542, + "loss": 0.8326292, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.19433594, + "step": 1686, + 
"time_per_iteration": 2.529229164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103459, + "balance_loss_mlp": 1.0848738, + "epoch": 0.3245479030396306, + "flos": 538102268928.0, + "grad_norm": 0.06566778614017829, + "language_loss": 0.86626494, + "learning_rate": 0.0007892358216051695, + "loss": 0.87729949, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.18566895, + "step": 1687, + "time_per_iteration": 2.7486979961395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103486, + "balance_loss_mlp": 1.08472204, + "epoch": 0.3247402847248942, + "flos": 547654597632.0, + "grad_norm": 0.06759540868164342, + "language_loss": 0.91712224, + "learning_rate": 0.0007889816396368803, + "loss": 0.92815715, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.18737793, + "step": 1688, + "time_per_iteration": 2.6558406352996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114869, + "balance_loss_mlp": 1.09629631, + "epoch": 0.3249326664101578, + "flos": 378151276032.0, + "grad_norm": 0.08904939998236257, + "language_loss": 0.85158062, + "learning_rate": 0.0007887273454759687, + "loss": 0.86272931, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.18566895, + "step": 1689, + "time_per_iteration": 2.4704487323760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120516, + "balance_loss_mlp": 1.10219383, + "epoch": 0.3251250480954213, + "flos": 528078237696.0, + "grad_norm": 0.07572457526068059, + "language_loss": 0.82346898, + "learning_rate": 0.0007884729392211603, + "loss": 0.83467412, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.18322754, + "step": 1690, + "time_per_iteration": 2.703683614730835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110893, + "balance_loss_mlp": 1.09243917, + "epoch": 0.3253174297806849, + "flos": 449659372032.0, + "grad_norm": 0.09550307140961752, + "language_loss": 0.85592222, + 
"learning_rate": 0.0007882184209712245, + "loss": 0.86703116, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.18444824, + "step": 1691, + "time_per_iteration": 2.560342788696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103123, + "balance_loss_mlp": 1.0847528, + "epoch": 0.32550981146594843, + "flos": 704181040128.0, + "grad_norm": 0.06639873617663411, + "language_loss": 0.85215127, + "learning_rate": 0.000787963790824974, + "loss": 0.86318254, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.18371582, + "step": 1692, + "time_per_iteration": 3.01053786277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102989, + "balance_loss_mlp": 1.08483362, + "epoch": 0.325702193151212, + "flos": 392704422912.0, + "grad_norm": 0.0791061376464097, + "language_loss": 0.89282072, + "learning_rate": 0.0007877090488812651, + "loss": 0.90385056, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.18164062, + "step": 1693, + "time_per_iteration": 2.4398083686828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101181, + "balance_loss_mlp": 1.08242917, + "epoch": 0.32589457483647555, + "flos": 577494696960.0, + "grad_norm": 0.07726533895166562, + "language_loss": 0.8386811, + "learning_rate": 0.0007874541952389973, + "loss": 0.84969294, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.1875, + "step": 1694, + "time_per_iteration": 2.6756813526153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104451, + "balance_loss_mlp": 1.08591402, + "epoch": 0.32608695652173914, + "flos": 498339454464.0, + "grad_norm": 0.08042259552829657, + "language_loss": 0.86563015, + "learning_rate": 0.0007871992299971136, + "loss": 0.87667465, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.1854248, + "step": 1695, + "time_per_iteration": 2.5899436473846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114554, + "balance_loss_mlp": 
1.096017, + "epoch": 0.32627933820700267, + "flos": 591145910784.0, + "grad_norm": 0.10859344338562153, + "language_loss": 0.84131289, + "learning_rate": 0.0007869441532546001, + "loss": 0.85245848, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.18530273, + "step": 1696, + "time_per_iteration": 2.7561304569244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107684, + "balance_loss_mlp": 1.08946884, + "epoch": 0.32647171989226625, + "flos": 609086532096.0, + "grad_norm": 0.10465149109525512, + "language_loss": 0.79480183, + "learning_rate": 0.0007866889651104867, + "loss": 0.8058787, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.18225098, + "step": 1697, + "time_per_iteration": 2.8031740188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108686, + "balance_loss_mlp": 1.08992255, + "epoch": 0.32666410157752984, + "flos": 477154520064.0, + "grad_norm": 0.0906406666849178, + "language_loss": 0.83109629, + "learning_rate": 0.000786433665663846, + "loss": 0.84218317, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.18762207, + "step": 1698, + "time_per_iteration": 2.6932730674743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106374, + "balance_loss_mlp": 1.08788502, + "epoch": 0.3268564832627934, + "flos": 718385822208.0, + "grad_norm": 0.09684550827651525, + "language_loss": 0.86934984, + "learning_rate": 0.0007861782550137942, + "loss": 0.88041353, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.18481445, + "step": 1699, + "time_per_iteration": 2.924246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111141, + "balance_loss_mlp": 1.09345734, + "epoch": 0.32704886494805696, + "flos": 768795618816.0, + "grad_norm": 0.08559105168392155, + "language_loss": 0.85866642, + "learning_rate": 0.0007859227332594901, + "loss": 0.86978048, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.17956543, + "step": 
1700, + "time_per_iteration": 2.930842876434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106023, + "balance_loss_mlp": 1.0883081, + "epoch": 0.3272412466333205, + "flos": 849912569856.0, + "grad_norm": 0.09580530814462011, + "language_loss": 0.84299338, + "learning_rate": 0.0007856671005001365, + "loss": 0.85405362, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.17712402, + "step": 1701, + "time_per_iteration": 3.2081515789031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110257, + "balance_loss_mlp": 1.09185123, + "epoch": 0.3274336283185841, + "flos": 831586507776.0, + "grad_norm": 0.08565892816740808, + "language_loss": 0.81811458, + "learning_rate": 0.0007854113568349787, + "loss": 0.8292172, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.18408203, + "step": 1702, + "time_per_iteration": 3.1229259967803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107177, + "balance_loss_mlp": 1.08861589, + "epoch": 0.3276260100038476, + "flos": 692027172864.0, + "grad_norm": 0.07794566968546403, + "language_loss": 0.80742395, + "learning_rate": 0.0007851555023633052, + "loss": 0.81849575, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.18554688, + "step": 1703, + "time_per_iteration": 2.87683367729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093514, + "balance_loss_mlp": 1.07504809, + "epoch": 0.3278183916891112, + "flos": 436059915264.0, + "grad_norm": 0.08579630919656539, + "language_loss": 0.82316363, + "learning_rate": 0.0007848995371844474, + "loss": 0.83409876, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.18469238, + "step": 1704, + "time_per_iteration": 2.543123483657837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108458, + "balance_loss_mlp": 1.09000456, + "epoch": 0.3280107733743748, + "flos": 461109169152.0, + "grad_norm": 0.08180134109500492, + "language_loss": 0.80497056, + 
"learning_rate": 0.0007846434613977801, + "loss": 0.81605512, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.18444824, + "step": 1705, + "time_per_iteration": 2.5694901943206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096099, + "balance_loss_mlp": 1.07726395, + "epoch": 0.3282031550596383, + "flos": 679319737344.0, + "grad_norm": 0.08642702147252447, + "language_loss": 0.7816267, + "learning_rate": 0.0007843872751027203, + "loss": 0.79258776, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.18835449, + "step": 1706, + "time_per_iteration": 2.8476855754852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091232, + "balance_loss_mlp": 1.07206345, + "epoch": 0.3283955367449019, + "flos": 545107014144.0, + "grad_norm": 0.07466271413415602, + "language_loss": 0.87096149, + "learning_rate": 0.0007841309783987287, + "loss": 0.88187379, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.19152832, + "step": 1707, + "time_per_iteration": 2.752048969268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090341, + "balance_loss_mlp": 1.0709219, + "epoch": 0.32858791843016544, + "flos": 481261118976.0, + "grad_norm": 0.08448532304164387, + "language_loss": 0.8909331, + "learning_rate": 0.0007838745713853084, + "loss": 0.90183651, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.1940918, + "step": 1708, + "time_per_iteration": 2.576037883758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085126, + "balance_loss_mlp": 1.06595731, + "epoch": 0.328780300115429, + "flos": 566805869568.0, + "grad_norm": 0.08173004229220915, + "language_loss": 0.84132832, + "learning_rate": 0.0007836180541620053, + "loss": 0.85217953, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.19152832, + "step": 1709, + "time_per_iteration": 2.7169644832611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084228, + 
"balance_loss_mlp": 1.06489253, + "epoch": 0.32897268180069256, + "flos": 476027933184.0, + "grad_norm": 0.09936539185168088, + "language_loss": 0.86458898, + "learning_rate": 0.0007833614268284082, + "loss": 0.8754313, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.19311523, + "step": 1710, + "time_per_iteration": 2.5532357692718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0119417, + "balance_loss_mlp": 1.17919695, + "epoch": 0.32916506348595614, + "flos": 1577301548544.0, + "grad_norm": 0.0502772245871811, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75303936, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.14941406, + "step": 1711, + "time_per_iteration": 4.93800163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084968, + "balance_loss_mlp": 1.06610942, + "epoch": 0.3293574451712197, + "flos": 482886945792.0, + "grad_norm": 0.0930127101012754, + "language_loss": 0.78468674, + "learning_rate": 0.0007828478422289016, + "loss": 0.7955364, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.18835449, + "step": 1712, + "time_per_iteration": 2.6106202602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094707, + "balance_loss_mlp": 1.0755266, + "epoch": 0.32954982685648326, + "flos": 622557508608.0, + "grad_norm": 0.07722441463790092, + "language_loss": 0.88823062, + "learning_rate": 0.0007825908851623833, + "loss": 0.89917773, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.19165039, + "step": 1713, + "time_per_iteration": 2.7708652019500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099793, + "balance_loss_mlp": 1.08030224, + "epoch": 0.32974220854174685, + "flos": 544971193344.0, + "grad_norm": 0.08538102567636462, + "language_loss": 0.84563339, + "learning_rate": 0.0007823338183843533, + "loss": 0.85663128, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 
0.19482422, + "step": 1714, + "time_per_iteration": 2.6919374465942383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101813, + "balance_loss_mlp": 1.08302569, + "epoch": 0.3299345902270104, + "flos": 982155870720.0, + "grad_norm": 0.10472435712491576, + "language_loss": 0.80579829, + "learning_rate": 0.0007820766419946141, + "loss": 0.81681645, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.18762207, + "step": 1715, + "time_per_iteration": 3.3962650299072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133772, + "balance_loss_mlp": 1.12051618, + "epoch": 0.33012697191227397, + "flos": 1403664090624.0, + "grad_norm": 0.022367363269540627, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80806249, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.1328125, + "step": 1716, + "time_per_iteration": 4.940594434738159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117089, + "balance_loss_mlp": 1.0989933, + "epoch": 0.3303193535975375, + "flos": 505151479296.0, + "grad_norm": 0.07989503427724588, + "language_loss": 0.7557565, + "learning_rate": 0.0007815619607794288, + "loss": 0.76692742, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.1809082, + "step": 1717, + "time_per_iteration": 2.6619300842285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112079, + "balance_loss_mlp": 1.10175252, + "epoch": 0.3305117352828011, + "flos": 937977739776.0, + "grad_norm": 0.08732146715249756, + "language_loss": 0.82213569, + "learning_rate": 0.0007813044561538001, + "loss": 0.83334363, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.19030762, + "step": 1718, + "time_per_iteration": 3.146427869796753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118808, + "balance_loss_mlp": 1.0996747, + "epoch": 0.3307041169680646, + "flos": 721499083776.0, + "grad_norm": 0.07987567281751332, + 
"language_loss": 0.88114393, + "learning_rate": 0.0007810468423160958, + "loss": 0.89233208, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.19128418, + "step": 1719, + "time_per_iteration": 2.882783889770508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116955, + "balance_loss_mlp": 1.09883487, + "epoch": 0.3308964986533282, + "flos": 583614761472.0, + "grad_norm": 0.07516231806962957, + "language_loss": 0.81837869, + "learning_rate": 0.0007807891193663306, + "loss": 0.82954824, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.18127441, + "step": 1720, + "time_per_iteration": 2.817091464996338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115372, + "balance_loss_mlp": 1.09681106, + "epoch": 0.33108888033859174, + "flos": 473576896512.0, + "grad_norm": 0.08207921946386207, + "language_loss": 0.82360268, + "learning_rate": 0.0007805312874045614, + "loss": 0.83475637, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.18566895, + "step": 1721, + "time_per_iteration": 2.5788111686706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127428, + "balance_loss_mlp": 1.10856915, + "epoch": 0.3312812620238553, + "flos": 386129534976.0, + "grad_norm": 0.08587725731854692, + "language_loss": 0.86701787, + "learning_rate": 0.0007802733465308874, + "loss": 0.87829208, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.18847656, + "step": 1722, + "time_per_iteration": 2.47092866897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134903, + "balance_loss_mlp": 1.11681938, + "epoch": 0.3314736437091189, + "flos": 494554056192.0, + "grad_norm": 0.07875115394989439, + "language_loss": 0.84537411, + "learning_rate": 0.0007800152968454501, + "loss": 0.85672319, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.1809082, + "step": 1723, + "time_per_iteration": 2.689821481704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01134288, + "balance_loss_mlp": 1.1161443, + "epoch": 0.33166602539438245, + "flos": 653662586880.0, + "grad_norm": 0.07553816314554183, + "language_loss": 0.90259147, + "learning_rate": 0.0007797571384484334, + "loss": 0.91393435, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.18139648, + "step": 1724, + "time_per_iteration": 2.881140947341919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130284, + "balance_loss_mlp": 1.11211705, + "epoch": 0.33185840707964603, + "flos": 520806620160.0, + "grad_norm": 0.09124178304656469, + "language_loss": 0.91919303, + "learning_rate": 0.0007794988714400633, + "loss": 0.93049586, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.18164062, + "step": 1725, + "time_per_iteration": 2.6405282020568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127051, + "balance_loss_mlp": 1.10823941, + "epoch": 0.33205078876490957, + "flos": 436949365248.0, + "grad_norm": 0.08426272849970545, + "language_loss": 0.85092092, + "learning_rate": 0.0007792404959206079, + "loss": 0.8621915, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.18798828, + "step": 1726, + "time_per_iteration": 2.5432610511779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127088, + "balance_loss_mlp": 1.1084559, + "epoch": 0.33224317045017315, + "flos": 768738719232.0, + "grad_norm": 0.07425680572728817, + "language_loss": 0.81119555, + "learning_rate": 0.0007789820119903774, + "loss": 0.82246637, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.1862793, + "step": 1727, + "time_per_iteration": 3.032222270965576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139545, + "balance_loss_mlp": 1.12562108, + "epoch": 0.3324355521354367, + "flos": 1466381574144.0, + "grad_norm": 0.028014537923784853, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79632211, + "num_input_tokens_seen": 142974032, + 
"router_z_loss_mlp": 0.13964844, + "step": 1728, + "time_per_iteration": 4.8402745723724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136807, + "balance_loss_mlp": 1.11797178, + "epoch": 0.3326279338207003, + "flos": 496691232768.0, + "grad_norm": 0.0895292490434253, + "language_loss": 0.8341223, + "learning_rate": 0.0007784647192990428, + "loss": 0.84549034, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.18798828, + "step": 1729, + "time_per_iteration": 2.732290267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138805, + "balance_loss_mlp": 1.11925435, + "epoch": 0.33282031550596386, + "flos": 635890093056.0, + "grad_norm": 0.13711052560491443, + "language_loss": 0.80506217, + "learning_rate": 0.0007782059107387696, + "loss": 0.81645024, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.1953125, + "step": 1730, + "time_per_iteration": 2.8793182373046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114255, + "balance_loss_mlp": 1.12199879, + "epoch": 0.3330126971912274, + "flos": 689511896064.0, + "grad_norm": 0.08825875418673053, + "language_loss": 0.8822093, + "learning_rate": 0.0007779469941693826, + "loss": 0.8936348, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.20556641, + "step": 1731, + "time_per_iteration": 2.862053632736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136737, + "balance_loss_mlp": 1.11668622, + "epoch": 0.333205078876491, + "flos": 566457504768.0, + "grad_norm": 0.0849632369239172, + "language_loss": 0.77099073, + "learning_rate": 0.0007776879696914029, + "loss": 0.78235817, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.20043945, + "step": 1732, + "time_per_iteration": 2.878997325897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137118, + "balance_loss_mlp": 1.11639929, + "epoch": 0.3333974605617545, + "flos": 640927987200.0, + "grad_norm": 0.06630968591745413, 
+ "language_loss": 0.88863558, + "learning_rate": 0.000777428837405392, + "loss": 0.90000677, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.20715332, + "step": 1733, + "time_per_iteration": 2.849579095840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113514, + "balance_loss_mlp": 1.1140877, + "epoch": 0.3335898422470181, + "flos": 461833062912.0, + "grad_norm": 0.1678685499329745, + "language_loss": 0.86820018, + "learning_rate": 0.0007771695974119544, + "loss": 0.87955153, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.21069336, + "step": 1734, + "time_per_iteration": 2.5213568210601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011438, + "balance_loss_mlp": 1.12223458, + "epoch": 0.33378222393228163, + "flos": 852870187008.0, + "grad_norm": 0.07580918658919847, + "language_loss": 0.75353694, + "learning_rate": 0.0007769102498117359, + "loss": 0.76497495, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.21569824, + "step": 1735, + "time_per_iteration": 3.1764426231384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152046, + "balance_loss_mlp": 1.12946832, + "epoch": 0.3339746056175452, + "flos": 954665491968.0, + "grad_norm": 0.07940235688963863, + "language_loss": 0.79215956, + "learning_rate": 0.000776650794705424, + "loss": 0.80368006, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.22570801, + "step": 1736, + "time_per_iteration": 3.311570644378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150562, + "balance_loss_mlp": 1.12822187, + "epoch": 0.33416698730280875, + "flos": 544825460736.0, + "grad_norm": 0.07154101803961593, + "language_loss": 0.82120311, + "learning_rate": 0.0007763912321937483, + "loss": 0.83270872, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.22351074, + "step": 1737, + "time_per_iteration": 2.7742059230804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01162448, + "balance_loss_mlp": 1.14046574, + "epoch": 0.33435936898807234, + "flos": 1014096070656.0, + "grad_norm": 0.09893982821491046, + "language_loss": 0.82392818, + "learning_rate": 0.0007761315623774799, + "loss": 0.83555263, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.21972656, + "step": 1738, + "time_per_iteration": 3.4311513900756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158796, + "balance_loss_mlp": 1.1368016, + "epoch": 0.3345517506733359, + "flos": 615221650944.0, + "grad_norm": 0.09029538875627986, + "language_loss": 0.87794083, + "learning_rate": 0.0007758717853574313, + "loss": 0.88952881, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.22009277, + "step": 1739, + "time_per_iteration": 2.771195411682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165102, + "balance_loss_mlp": 1.14437175, + "epoch": 0.33474413235859946, + "flos": 494593703424.0, + "grad_norm": 0.0906994231642372, + "language_loss": 0.89945674, + "learning_rate": 0.0007756119012344571, + "loss": 0.91110778, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.20739746, + "step": 1740, + "time_per_iteration": 2.60304594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150213, + "balance_loss_mlp": 1.12998307, + "epoch": 0.33493651404386304, + "flos": 628381338624.0, + "grad_norm": 0.09292231464176055, + "language_loss": 0.8424325, + "learning_rate": 0.0007753519101094535, + "loss": 0.85393465, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.20227051, + "step": 1741, + "time_per_iteration": 2.763831377029419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130901, + "balance_loss_mlp": 1.11101699, + "epoch": 0.3351288957291266, + "flos": 513727723008.0, + "grad_norm": 0.09107418087972757, + "language_loss": 0.86003816, + "learning_rate": 0.0007750918120833575, + "loss": 0.87134719, + "num_input_tokens_seen": 144030272, + 
"router_z_loss_mlp": 0.19873047, + "step": 1742, + "time_per_iteration": 2.5983192920684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110821, + "balance_loss_mlp": 1.08914852, + "epoch": 0.33532127741439016, + "flos": 647302814208.0, + "grad_norm": 0.08951756084527424, + "language_loss": 0.86919558, + "learning_rate": 0.0007748316072571485, + "loss": 0.88027763, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.19042969, + "step": 1743, + "time_per_iteration": 2.826857328414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096104, + "balance_loss_mlp": 1.07641089, + "epoch": 0.3355136590996537, + "flos": 768464506368.0, + "grad_norm": 0.07101368717418235, + "language_loss": 0.78953618, + "learning_rate": 0.0007745712957318467, + "loss": 0.80049723, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.19677734, + "step": 1744, + "time_per_iteration": 2.9848310947418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099368, + "balance_loss_mlp": 1.08075917, + "epoch": 0.3357060407849173, + "flos": 595536634368.0, + "grad_norm": 0.06728871536655502, + "language_loss": 0.86402392, + "learning_rate": 0.0007743108776085141, + "loss": 0.87501758, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.18603516, + "step": 1745, + "time_per_iteration": 2.7903690338134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100918, + "balance_loss_mlp": 1.08167791, + "epoch": 0.3358984224701808, + "flos": 598590425088.0, + "grad_norm": 0.08105774730722601, + "language_loss": 0.83074069, + "learning_rate": 0.0007740503529882543, + "loss": 0.84174985, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.19238281, + "step": 1746, + "time_per_iteration": 2.8164098262786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102514, + "balance_loss_mlp": 1.08327341, + "epoch": 0.3360908041554444, + "flos": 578329818624.0, + "grad_norm": 
0.08939656691142209, + "language_loss": 0.90720791, + "learning_rate": 0.0007737897219722114, + "loss": 0.91823304, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.19226074, + "step": 1747, + "time_per_iteration": 2.682877540588379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098067, + "balance_loss_mlp": 1.07800448, + "epoch": 0.336283185840708, + "flos": 513589330944.0, + "grad_norm": 0.08976840313873562, + "language_loss": 0.81010032, + "learning_rate": 0.0007735289846615716, + "loss": 0.82108104, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.20068359, + "step": 1748, + "time_per_iteration": 2.687856674194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096768, + "balance_loss_mlp": 1.07715857, + "epoch": 0.3364755675259715, + "flos": 524974887936.0, + "grad_norm": 0.08605901070846078, + "language_loss": 0.81949353, + "learning_rate": 0.0007732681411575621, + "loss": 0.83046126, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.19616699, + "step": 1749, + "time_per_iteration": 2.711014747619629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100357, + "balance_loss_mlp": 1.08002043, + "epoch": 0.3366679492112351, + "flos": 554869315584.0, + "grad_norm": 0.0865041685268045, + "language_loss": 0.87347746, + "learning_rate": 0.0007730071915614514, + "loss": 0.88448107, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.20349121, + "step": 1750, + "time_per_iteration": 2.7877442836761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097656, + "balance_loss_mlp": 1.07754588, + "epoch": 0.33686033089649864, + "flos": 427273698816.0, + "grad_norm": 0.099917727371098, + "language_loss": 0.88751096, + "learning_rate": 0.0007727461359745489, + "loss": 0.89848751, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.2010498, + "step": 1751, + "time_per_iteration": 2.5344979763031006 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01110101, + "balance_loss_mlp": 1.09051538, + "epoch": 0.3370527125817622, + "flos": 541729451520.0, + "grad_norm": 0.06874041131201088, + "language_loss": 0.85970122, + "learning_rate": 0.0007724849744982056, + "loss": 0.87080222, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.19592285, + "step": 1752, + "time_per_iteration": 2.7278292179107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118351, + "balance_loss_mlp": 1.09820437, + "epoch": 0.33724509426702576, + "flos": 542114892288.0, + "grad_norm": 0.07532767444648983, + "language_loss": 0.81245279, + "learning_rate": 0.0007722237072338131, + "loss": 0.82363629, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.20141602, + "step": 1753, + "time_per_iteration": 2.715123414993286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129336, + "balance_loss_mlp": 1.10946393, + "epoch": 0.33743747595228935, + "flos": 472796103168.0, + "grad_norm": 0.09907858659655516, + "language_loss": 0.85174322, + "learning_rate": 0.0007719623342828046, + "loss": 0.86303657, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.1986084, + "step": 1754, + "time_per_iteration": 2.580603837966919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011433, + "balance_loss_mlp": 1.12336826, + "epoch": 0.33762985763755293, + "flos": 469818662400.0, + "grad_norm": 0.09468217220840029, + "language_loss": 0.84008503, + "learning_rate": 0.000771700855746654, + "loss": 0.85151798, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.19934082, + "step": 1755, + "time_per_iteration": 2.6360206604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115036, + "balance_loss_mlp": 1.13060665, + "epoch": 0.33782223932281646, + "flos": 492251323392.0, + "grad_norm": 0.06173278613548714, + "language_loss": 0.8813622, + "learning_rate": 0.0007714392717268763, + "loss": 0.89286578, + "num_input_tokens_seen": 
145030576, + "router_z_loss_mlp": 0.19750977, + "step": 1756, + "time_per_iteration": 2.610471725463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169557, + "balance_loss_mlp": 1.14999521, + "epoch": 0.33801462100808005, + "flos": 465064892928.0, + "grad_norm": 0.08560719953811556, + "language_loss": 0.86437309, + "learning_rate": 0.0007711775823250273, + "loss": 0.87606871, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.19555664, + "step": 1757, + "time_per_iteration": 2.5406768321990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179489, + "balance_loss_mlp": 1.16010547, + "epoch": 0.3382070026933436, + "flos": 795668189184.0, + "grad_norm": 0.06814979795763555, + "language_loss": 0.82866555, + "learning_rate": 0.0007709157876427039, + "loss": 0.84046042, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.19372559, + "step": 1758, + "time_per_iteration": 3.144188642501831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152293, + "balance_loss_mlp": 1.13320732, + "epoch": 0.33839938437860717, + "flos": 508430297088.0, + "grad_norm": 0.08381425857535812, + "language_loss": 0.85356963, + "learning_rate": 0.0007706538877815439, + "loss": 0.86509264, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.1907959, + "step": 1759, + "time_per_iteration": 2.6544251441955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145804, + "balance_loss_mlp": 1.12751722, + "epoch": 0.3385917660638707, + "flos": 484243329024.0, + "grad_norm": 0.07160952497477109, + "language_loss": 0.83250809, + "learning_rate": 0.0007703918828432259, + "loss": 0.84396613, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.18273926, + "step": 1760, + "time_per_iteration": 2.639800548553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139561, + "balance_loss_mlp": 1.12061834, + "epoch": 0.3387841477491343, + "flos": 545339381760.0, + "grad_norm": 
0.07528387784347967, + "language_loss": 0.89063478, + "learning_rate": 0.000770129772929469, + "loss": 0.90203035, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.18933105, + "step": 1761, + "time_per_iteration": 2.690807580947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143571, + "balance_loss_mlp": 1.12493849, + "epoch": 0.3389765294343978, + "flos": 719801676288.0, + "grad_norm": 0.07941213480930635, + "language_loss": 0.87791038, + "learning_rate": 0.0007698675581420334, + "loss": 0.88934612, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.18615723, + "step": 1762, + "time_per_iteration": 2.897935390472412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135098, + "balance_loss_mlp": 1.11646509, + "epoch": 0.3391689111196614, + "flos": 699928708608.0, + "grad_norm": 0.08353352960784785, + "language_loss": 0.78453314, + "learning_rate": 0.0007696052385827199, + "loss": 0.79588407, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.18603516, + "step": 1763, + "time_per_iteration": 2.960893154144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144695, + "balance_loss_mlp": 1.12652755, + "epoch": 0.339361292804925, + "flos": 627093964800.0, + "grad_norm": 0.0785374693184301, + "language_loss": 0.77934641, + "learning_rate": 0.00076934281435337, + "loss": 0.7907933, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.18188477, + "step": 1764, + "time_per_iteration": 2.8066813945770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131427, + "balance_loss_mlp": 1.11263931, + "epoch": 0.33955367449018853, + "flos": 609600453120.0, + "grad_norm": 0.11428683327792583, + "language_loss": 0.86483157, + "learning_rate": 0.0007690802855558658, + "loss": 0.87614584, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.18762207, + "step": 1765, + "time_per_iteration": 2.9382381439208984 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01097374, + "balance_loss_mlp": 1.08335495, + "epoch": 0.3397460561754521, + "flos": 1453310346240.0, + "grad_norm": 0.038046821471630334, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77472329, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.140625, + "step": 1766, + "time_per_iteration": 4.939141750335693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131665, + "balance_loss_mlp": 1.11155438, + "epoch": 0.33993843786071565, + "flos": 487312174080.0, + "grad_norm": 0.1972001158351392, + "language_loss": 0.89103919, + "learning_rate": 0.0007685549146641262, + "loss": 0.90235579, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.20117188, + "step": 1767, + "time_per_iteration": 2.596677780151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113815, + "balance_loss_mlp": 1.11898088, + "epoch": 0.34013081954597923, + "flos": 417338500608.0, + "grad_norm": 0.0754052007703104, + "language_loss": 0.87994409, + "learning_rate": 0.0007682920727738579, + "loss": 0.89132559, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.19152832, + "step": 1768, + "time_per_iteration": 2.572606325149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011476, + "balance_loss_mlp": 1.12763298, + "epoch": 0.34032320123124277, + "flos": 437520185856.0, + "grad_norm": 0.09008834675764238, + "language_loss": 0.84476101, + "learning_rate": 0.000768029126723369, + "loss": 0.85623699, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.19958496, + "step": 1769, + "time_per_iteration": 2.517974615097046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117614, + "balance_loss_mlp": 1.15621972, + "epoch": 0.34051558291650635, + "flos": 457590643200.0, + "grad_norm": 0.08324416055939475, + "language_loss": 0.81926113, + "learning_rate": 0.0007677660766147447, + "loss": 0.83102256, + "num_input_tokens_seen": 
146172608, + "router_z_loss_mlp": 0.19909668, + "step": 1770, + "time_per_iteration": 2.525979518890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113196, + "balance_loss_mlp": 1.0996542, + "epoch": 0.3407079646017699, + "flos": 1558849204224.0, + "grad_norm": 0.058076344856887535, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.73584139, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.13574219, + "step": 1771, + "time_per_iteration": 4.954227924346924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01208192, + "balance_loss_mlp": 1.18773556, + "epoch": 0.3409003462870335, + "flos": 492555271680.0, + "grad_norm": 0.12544773614524246, + "language_loss": 0.79168922, + "learning_rate": 0.0007672396646316306, + "loss": 0.80377114, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.20446777, + "step": 1772, + "time_per_iteration": 2.5573487281799316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01184404, + "balance_loss_mlp": 1.1633631, + "epoch": 0.34109272797229706, + "flos": 808479512064.0, + "grad_norm": 0.0812632702006711, + "language_loss": 0.80576169, + "learning_rate": 0.000766976302961512, + "loss": 0.81760573, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.21057129, + "step": 1773, + "time_per_iteration": 2.9981236457824707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174902, + "balance_loss_mlp": 1.15440965, + "epoch": 0.3412851096575606, + "flos": 470142434304.0, + "grad_norm": 0.08509012237921207, + "language_loss": 0.81078374, + "learning_rate": 0.0007667128376420003, + "loss": 0.82253277, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.20495605, + "step": 1774, + "time_per_iteration": 2.6422817707061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141783, + "balance_loss_mlp": 1.12135017, + "epoch": 0.3414774913428242, + "flos": 595675026432.0, + "grad_norm": 
0.07609688435085656, + "language_loss": 0.84329826, + "learning_rate": 0.0007664492687753817, + "loss": 0.85471606, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.2043457, + "step": 1775, + "time_per_iteration": 2.719444513320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133962, + "balance_loss_mlp": 1.11357749, + "epoch": 0.3416698730280877, + "flos": 527463000576.0, + "grad_norm": 0.0684007600896635, + "language_loss": 0.81250805, + "learning_rate": 0.000766185596463983, + "loss": 0.82384765, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.20397949, + "step": 1776, + "time_per_iteration": 2.641289472579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118215, + "balance_loss_mlp": 1.09844995, + "epoch": 0.3418622547133513, + "flos": 874640623104.0, + "grad_norm": 0.08848921826202948, + "language_loss": 0.76858222, + "learning_rate": 0.0007659218208101706, + "loss": 0.77976441, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.19750977, + "step": 1777, + "time_per_iteration": 3.121042490005493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111411, + "balance_loss_mlp": 1.09507275, + "epoch": 0.34205463639861483, + "flos": 603744689664.0, + "grad_norm": 0.06446172596419028, + "language_loss": 0.84679043, + "learning_rate": 0.0007656579419163515, + "loss": 0.85793149, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.19018555, + "step": 1778, + "time_per_iteration": 2.8044042587280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115799, + "balance_loss_mlp": 1.09639132, + "epoch": 0.3422470180838784, + "flos": 463780090368.0, + "grad_norm": 0.08419061749659096, + "language_loss": 0.7684586, + "learning_rate": 0.0007653939598849724, + "loss": 0.77961665, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.19396973, + "step": 1779, + "time_per_iteration": 2.5383636951446533 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01090316, + "balance_loss_mlp": 1.07667828, + "epoch": 0.34243939976914195, + "flos": 1586428416000.0, + "grad_norm": 0.04688573866990776, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83970523, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.13671875, + "step": 1780, + "time_per_iteration": 4.939146041870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100725, + "balance_loss_mlp": 1.0817349, + "epoch": 0.34263178145440554, + "flos": 873017367552.0, + "grad_norm": 0.09328427377426286, + "language_loss": 0.7993626, + "learning_rate": 0.000764865686819522, + "loss": 0.81036985, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.18969727, + "step": 1781, + "time_per_iteration": 3.0855140686035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097786, + "balance_loss_mlp": 1.07818818, + "epoch": 0.3428241631396691, + "flos": 506878622208.0, + "grad_norm": 0.0784117519331498, + "language_loss": 0.85829425, + "learning_rate": 0.0007646013959905449, + "loss": 0.86927211, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.19592285, + "step": 1782, + "time_per_iteration": 2.6008715629577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094184, + "balance_loss_mlp": 1.07484865, + "epoch": 0.34301654482493266, + "flos": 880039365120.0, + "grad_norm": 0.10020930760951015, + "language_loss": 0.80767882, + "learning_rate": 0.0007643370024341949, + "loss": 0.81862062, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.19311523, + "step": 1783, + "time_per_iteration": 3.1744794845581055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093606, + "balance_loss_mlp": 1.0741868, + "epoch": 0.34320892651019624, + "flos": 431763167232.0, + "grad_norm": 0.06177623901241128, + "language_loss": 0.82775044, + "learning_rate": 0.0007640725062531195, + "loss": 0.83868653, + "num_input_tokens_seen": 
147495856, + "router_z_loss_mlp": 0.19396973, + "step": 1784, + "time_per_iteration": 2.5207273960113525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095739, + "balance_loss_mlp": 1.07624829, + "epoch": 0.3434013081954598, + "flos": 463641698304.0, + "grad_norm": 0.07609738057692413, + "language_loss": 0.86137176, + "learning_rate": 0.0007638079075500047, + "loss": 0.87232918, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.19482422, + "step": 1785, + "time_per_iteration": 2.6027305126190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_mlp": 1.02909327, + "epoch": 0.34359368988072336, + "flos": 1557332034048.0, + "grad_norm": 0.02730093024075542, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76222348, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.12597656, + "step": 1786, + "time_per_iteration": 4.981709718704224 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123604, + "balance_loss_mlp": 1.10412502, + "epoch": 0.3437860715659869, + "flos": 495527569920.0, + "grad_norm": 0.0828485615256838, + "language_loss": 0.82775986, + "learning_rate": 0.0007632784029886026, + "loss": 0.83899587, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.19470215, + "step": 1787, + "time_per_iteration": 2.6825647354125977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121547, + "balance_loss_mlp": 1.10167432, + "epoch": 0.3439784532512505, + "flos": 718274594304.0, + "grad_norm": 0.06541046205818803, + "language_loss": 0.84959292, + "learning_rate": 0.0007630134973358873, + "loss": 0.86080837, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.1986084, + "step": 1788, + "time_per_iteration": 3.0164642333984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112959, + "balance_loss_mlp": 1.11006355, + "epoch": 0.34417083493651407, + "flos": 565862091264.0, + 
"grad_norm": 0.07128557935976318, + "language_loss": 0.86626679, + "learning_rate": 0.0007627484895722763, + "loss": 0.8775627, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.1953125, + "step": 1789, + "time_per_iteration": 2.7014718055725098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134771, + "balance_loss_mlp": 1.11494648, + "epoch": 0.3443632166217776, + "flos": 796330414080.0, + "grad_norm": 0.08217230393347356, + "language_loss": 0.80139697, + "learning_rate": 0.0007624833798006552, + "loss": 0.81274474, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.19812012, + "step": 1790, + "time_per_iteration": 3.0889768600463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130993, + "balance_loss_mlp": 1.11054873, + "epoch": 0.3445555983070412, + "flos": 569313805824.0, + "grad_norm": 0.08452412416329605, + "language_loss": 0.83807981, + "learning_rate": 0.0007622181681239483, + "loss": 0.84938967, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.20446777, + "step": 1791, + "time_per_iteration": 2.668236017227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126724, + "balance_loss_mlp": 1.10656524, + "epoch": 0.3447479799923047, + "flos": 568814565888.0, + "grad_norm": 0.06876002435899166, + "language_loss": 0.84450197, + "learning_rate": 0.0007619528546451202, + "loss": 0.85576922, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.20153809, + "step": 1792, + "time_per_iteration": 2.820676326751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121181, + "balance_loss_mlp": 1.10096347, + "epoch": 0.3449403616775683, + "flos": 967723863552.0, + "grad_norm": 0.0839228841992506, + "language_loss": 0.83888298, + "learning_rate": 0.0007616874394671745, + "loss": 0.8500948, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.20214844, + "step": 1793, + "time_per_iteration": 3.339189291000366 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121499, + "balance_loss_mlp": 1.10161519, + "epoch": 0.34513274336283184, + "flos": 568607164416.0, + "grad_norm": 0.08136840273622996, + "language_loss": 0.84983474, + "learning_rate": 0.0007614219226931547, + "loss": 0.86104971, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.19873047, + "step": 1794, + "time_per_iteration": 2.7227368354797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129707, + "balance_loss_mlp": 1.10958409, + "epoch": 0.3453251250480954, + "flos": 460943612928.0, + "grad_norm": 0.09590444489475901, + "language_loss": 0.84532511, + "learning_rate": 0.0007611563044261435, + "loss": 0.85662222, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.2010498, + "step": 1795, + "time_per_iteration": 2.546884536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125004, + "balance_loss_mlp": 1.10475039, + "epoch": 0.34551750673335896, + "flos": 415621269504.0, + "grad_norm": 0.0814281657370807, + "language_loss": 0.86456835, + "learning_rate": 0.0007608905847692631, + "loss": 0.87581837, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.20251465, + "step": 1796, + "time_per_iteration": 2.482780933380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116976, + "balance_loss_mlp": 1.0972116, + "epoch": 0.34570988841862255, + "flos": 587854609920.0, + "grad_norm": 0.08445523119956015, + "language_loss": 0.86433315, + "learning_rate": 0.0007606247638256749, + "loss": 0.87550294, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.19750977, + "step": 1797, + "time_per_iteration": 2.8908944129943848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041988, + "balance_loss_mlp": 1.03016257, + "epoch": 0.34590227010388613, + "flos": 1567694518272.0, + "grad_norm": 0.0206101242754925, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 
0.79212284, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.11816406, + "step": 1798, + "time_per_iteration": 4.959855079650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037793, + "balance_loss_mlp": 1.02591991, + "epoch": 0.34609465178914967, + "flos": 1537743564288.0, + "grad_norm": 0.018708496865608985, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80365002, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.11865234, + "step": 1799, + "time_per_iteration": 4.7935545444488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129126, + "balance_loss_mlp": 1.10934877, + "epoch": 0.34628703347441325, + "flos": 609363316224.0, + "grad_norm": 0.08973397272803926, + "language_loss": 0.85623878, + "learning_rate": 0.0007598266943068686, + "loss": 0.86753011, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.19763184, + "step": 1800, + "time_per_iteration": 2.8019869327545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112277, + "balance_loss_mlp": 1.10252821, + "epoch": 0.3464794151596768, + "flos": 473319936000.0, + "grad_norm": 0.0674943248051881, + "language_loss": 0.83542264, + "learning_rate": 0.0007595604692488507, + "loss": 0.84665036, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.20239258, + "step": 1801, + "time_per_iteration": 2.6360082626342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126397, + "balance_loss_mlp": 1.10636973, + "epoch": 0.34667179684494037, + "flos": 605681805312.0, + "grad_norm": 0.06909713253641608, + "language_loss": 0.82839429, + "learning_rate": 0.0007592941434205215, + "loss": 0.83965826, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.20031738, + "step": 1802, + "time_per_iteration": 2.8132333755493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015999, + "balance_loss_mlp": 1.0041256, + "epoch": 
0.3468641785302039, + "flos": 1564912369152.0, + "grad_norm": 0.010015114509230977, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74587059, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.11865234, + "step": 1803, + "time_per_iteration": 5.086339950561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104785, + "balance_loss_mlp": 1.08531845, + "epoch": 0.3470565602154675, + "flos": 907265442816.0, + "grad_norm": 0.07426270940157376, + "language_loss": 0.80069757, + "learning_rate": 0.0007587611898665566, + "loss": 0.81174541, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.19458008, + "step": 1804, + "time_per_iteration": 3.092641592025757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110039, + "balance_loss_mlp": 1.0910604, + "epoch": 0.347248941900731, + "flos": 638902038528.0, + "grad_norm": 0.07581928055471668, + "language_loss": 0.81691384, + "learning_rate": 0.0007584945623478315, + "loss": 0.82801425, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.18969727, + "step": 1805, + "time_per_iteration": 2.846060037612915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104021, + "balance_loss_mlp": 1.08541238, + "epoch": 0.3474413235859946, + "flos": 847362788352.0, + "grad_norm": 0.07473751481828116, + "language_loss": 0.80751228, + "learning_rate": 0.000758227834472617, + "loss": 0.81855249, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.18603516, + "step": 1806, + "time_per_iteration": 3.0771524906158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111397, + "balance_loss_mlp": 1.09499145, + "epoch": 0.3476337052712582, + "flos": 515654926848.0, + "grad_norm": 0.07117533522239076, + "language_loss": 0.77160984, + "learning_rate": 0.0007579610063444664, + "loss": 0.78274959, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.18969727, + "step": 1807, + 
"time_per_iteration": 2.765228509902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104162, + "balance_loss_mlp": 1.08548236, + "epoch": 0.34782608695652173, + "flos": 913551063552.0, + "grad_norm": 0.0766054024965894, + "language_loss": 0.8690778, + "learning_rate": 0.0007576940780669712, + "loss": 0.88011932, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.18664551, + "step": 1808, + "time_per_iteration": 3.279489278793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123806, + "balance_loss_mlp": 1.10510182, + "epoch": 0.3480184686417853, + "flos": 773714944512.0, + "grad_norm": 0.07904928967380129, + "language_loss": 0.84151316, + "learning_rate": 0.0007574270497437624, + "loss": 0.85275126, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.18701172, + "step": 1809, + "time_per_iteration": 2.987900733947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122089, + "balance_loss_mlp": 1.10336101, + "epoch": 0.34821085032704885, + "flos": 576839812608.0, + "grad_norm": 0.06962767524782593, + "language_loss": 0.87729847, + "learning_rate": 0.000757159921478509, + "loss": 0.88851929, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.18725586, + "step": 1810, + "time_per_iteration": 2.8426477909088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055659, + "balance_loss_mlp": 1.04316616, + "epoch": 0.34840323201231244, + "flos": 1524947295744.0, + "grad_norm": 0.023331363727236345, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75506294, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.125, + "step": 1811, + "time_per_iteration": 4.784373044967651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146056, + "balance_loss_mlp": 1.12720931, + "epoch": 0.34859561369757597, + "flos": 509164102656.0, + "grad_norm": 0.0794635065049281, + "language_loss": 0.87678373, + 
"learning_rate": 0.0007566253655367423, + "loss": 0.88824427, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.18823242, + "step": 1812, + "time_per_iteration": 2.649627685546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151883, + "balance_loss_mlp": 1.13314283, + "epoch": 0.34878799538283956, + "flos": 548662616064.0, + "grad_norm": 0.08948054068367119, + "language_loss": 0.89612782, + "learning_rate": 0.000756357938067762, + "loss": 0.90764666, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.18737793, + "step": 1813, + "time_per_iteration": 2.6953165531158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151317, + "balance_loss_mlp": 1.13220787, + "epoch": 0.34898037706810314, + "flos": 983638536192.0, + "grad_norm": 0.08322597535257283, + "language_loss": 0.82610291, + "learning_rate": 0.0007560904110718033, + "loss": 0.83761609, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.19104004, + "step": 1814, + "time_per_iteration": 3.2898061275482178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124529, + "balance_loss_mlp": 1.10556281, + "epoch": 0.3491727587533667, + "flos": 681605217792.0, + "grad_norm": 0.08612147208900138, + "language_loss": 0.8345058, + "learning_rate": 0.0007558227846527297, + "loss": 0.84575117, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.1895752, + "step": 1815, + "time_per_iteration": 2.9130759239196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123683, + "balance_loss_mlp": 1.10491991, + "epoch": 0.34936514043863026, + "flos": 394026301440.0, + "grad_norm": 0.09988459790630169, + "language_loss": 0.83118773, + "learning_rate": 0.0007555550589144429, + "loss": 0.84242463, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.1875, + "step": 1816, + "time_per_iteration": 2.4752960205078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117728, + 
"balance_loss_mlp": 1.09804606, + "epoch": 0.3495575221238938, + "flos": 461363558400.0, + "grad_norm": 0.07751955343806295, + "language_loss": 0.84176993, + "learning_rate": 0.000755287233960883, + "loss": 0.85294718, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.19665527, + "step": 1817, + "time_per_iteration": 2.597585439682007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098926, + "balance_loss_mlp": 1.07926798, + "epoch": 0.3497499038091574, + "flos": 724172576256.0, + "grad_norm": 0.08165217026076037, + "language_loss": 0.7746554, + "learning_rate": 0.0007550193098960292, + "loss": 0.78564465, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.19641113, + "step": 1818, + "time_per_iteration": 2.9257001876831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092408, + "balance_loss_mlp": 1.07195151, + "epoch": 0.3499422854944209, + "flos": 827729528832.0, + "grad_norm": 0.0691698669989475, + "language_loss": 0.85927546, + "learning_rate": 0.0007547512868238988, + "loss": 0.87019956, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.20446777, + "step": 1819, + "time_per_iteration": 3.1347925662994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108081, + "balance_loss_mlp": 1.06050837, + "epoch": 0.3501346671796845, + "flos": 493479226368.0, + "grad_norm": 0.09514158419007644, + "language_loss": 0.83275855, + "learning_rate": 0.0007544831648485473, + "loss": 0.84356666, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.20300293, + "step": 1820, + "time_per_iteration": 2.7215232849121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108343, + "balance_loss_mlp": 1.06210327, + "epoch": 0.35032704886494803, + "flos": 578752335360.0, + "grad_norm": 0.1073780855917388, + "language_loss": 0.81151676, + "learning_rate": 0.0007542149440740694, + "loss": 0.82235104, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 
0.21350098, + "step": 1821, + "time_per_iteration": 2.6931724548339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080905, + "balance_loss_mlp": 1.05936432, + "epoch": 0.3505194305502116, + "flos": 584672338944.0, + "grad_norm": 0.1562262811893555, + "language_loss": 0.85392433, + "learning_rate": 0.000753946624604597, + "loss": 0.86473334, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.2154541, + "step": 1822, + "time_per_iteration": 2.7700464725494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072173, + "balance_loss_mlp": 1.05028629, + "epoch": 0.3507118122354752, + "flos": 526958991360.0, + "grad_norm": 0.08427952696401207, + "language_loss": 0.87906677, + "learning_rate": 0.0007536782065443015, + "loss": 0.88978851, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.21899414, + "step": 1823, + "time_per_iteration": 2.618863105773926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084137, + "balance_loss_mlp": 1.06188059, + "epoch": 0.35090419392073874, + "flos": 511523735040.0, + "grad_norm": 0.15781529291863344, + "language_loss": 0.75435269, + "learning_rate": 0.0007534096899973919, + "loss": 0.76519406, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.22253418, + "step": 1824, + "time_per_iteration": 2.5891709327697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086563, + "balance_loss_mlp": 1.06396103, + "epoch": 0.3510965756060023, + "flos": 564021149184.0, + "grad_norm": 0.09040612359289192, + "language_loss": 0.82346433, + "learning_rate": 0.0007531410750681154, + "loss": 0.83432996, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.22595215, + "step": 1825, + "time_per_iteration": 2.810972213745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111588, + "balance_loss_mlp": 1.09455299, + "epoch": 0.35128895729126586, + "flos": 1020535137792.0, + "grad_norm": 0.07292466952983544, + 
"language_loss": 0.86399037, + "learning_rate": 0.0007528723618607575, + "loss": 0.87514913, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.21325684, + "step": 1826, + "time_per_iteration": 3.474869966506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133984, + "balance_loss_mlp": 1.11370611, + "epoch": 0.35148133897652944, + "flos": 588262445568.0, + "grad_norm": 0.08837862995453269, + "language_loss": 0.82404733, + "learning_rate": 0.0007526035504796422, + "loss": 0.83538717, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.20275879, + "step": 1827, + "time_per_iteration": 2.8155739307403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150633, + "balance_loss_mlp": 1.13051069, + "epoch": 0.351673720661793, + "flos": 495300344832.0, + "grad_norm": 0.10569988158542801, + "language_loss": 0.86735702, + "learning_rate": 0.0007523346410291312, + "loss": 0.87886333, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.20117188, + "step": 1828, + "time_per_iteration": 2.788748025894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147917, + "balance_loss_mlp": 1.12691236, + "epoch": 0.35186610234705656, + "flos": 762670411776.0, + "grad_norm": 0.11718864183712574, + "language_loss": 0.84880495, + "learning_rate": 0.0007520656336136245, + "loss": 0.86028415, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.21020508, + "step": 1829, + "time_per_iteration": 2.995258331298828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144006, + "balance_loss_mlp": 1.12407422, + "epoch": 0.3520584840323201, + "flos": 626135132160.0, + "grad_norm": 0.07752679685559628, + "language_loss": 0.87776285, + "learning_rate": 0.0007517965283375599, + "loss": 0.88920295, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.19921875, + "step": 1830, + "time_per_iteration": 2.9131507873535156 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01137113, + "balance_loss_mlp": 1.11694324, + "epoch": 0.3522508657175837, + "flos": 537388286976.0, + "grad_norm": 0.0712879308552529, + "language_loss": 0.89257503, + "learning_rate": 0.0007515273253054132, + "loss": 0.90394616, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.20166016, + "step": 1831, + "time_per_iteration": 2.7115964889526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144172, + "balance_loss_mlp": 1.12451458, + "epoch": 0.35244324740284727, + "flos": 567384030720.0, + "grad_norm": 0.08358912815272257, + "language_loss": 0.82353687, + "learning_rate": 0.0007512580246216988, + "loss": 0.83497858, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.19665527, + "step": 1832, + "time_per_iteration": 2.7660555839538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137261, + "balance_loss_mlp": 1.11740053, + "epoch": 0.3526356290881108, + "flos": 513058157568.0, + "grad_norm": 0.08932198209233742, + "language_loss": 0.84907162, + "learning_rate": 0.000750988626390968, + "loss": 0.86044419, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.19848633, + "step": 1833, + "time_per_iteration": 2.6142635345458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135258, + "balance_loss_mlp": 1.11577928, + "epoch": 0.3528280107733744, + "flos": 595791023616.0, + "grad_norm": 0.0712191508208571, + "language_loss": 0.84978765, + "learning_rate": 0.0007507191307178108, + "loss": 0.86114025, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.19470215, + "step": 1834, + "time_per_iteration": 2.8424935340881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124766, + "balance_loss_mlp": 1.10512066, + "epoch": 0.3530203924586379, + "flos": 551234792448.0, + "grad_norm": 0.12990441969076433, + "language_loss": 0.74422562, + "learning_rate": 0.0007504495377068543, + "loss": 0.75547332, + "num_input_tokens_seen": 
152064864, + "router_z_loss_mlp": 0.19628906, + "step": 1835, + "time_per_iteration": 2.8079066276550293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129638, + "balance_loss_mlp": 1.11026645, + "epoch": 0.3532127741439015, + "flos": 652990450176.0, + "grad_norm": 0.09183665723882013, + "language_loss": 0.81276792, + "learning_rate": 0.0007501798474627642, + "loss": 0.82406431, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.19360352, + "step": 1836, + "time_per_iteration": 2.952760934829712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120111, + "balance_loss_mlp": 1.10109687, + "epoch": 0.35340515582916504, + "flos": 722791226880.0, + "grad_norm": 0.11181895830758388, + "language_loss": 0.83497429, + "learning_rate": 0.0007499100600902433, + "loss": 0.84617543, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.18994141, + "step": 1837, + "time_per_iteration": 3.0599989891052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112032, + "balance_loss_mlp": 1.09237409, + "epoch": 0.35359753751442863, + "flos": 594894233088.0, + "grad_norm": 0.08618493176537427, + "language_loss": 0.84243816, + "learning_rate": 0.0007496401756940324, + "loss": 0.85355854, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.1965332, + "step": 1838, + "time_per_iteration": 2.7366483211517334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111194, + "balance_loss_mlp": 1.09217548, + "epoch": 0.3537899191996922, + "flos": 632668174848.0, + "grad_norm": 0.1107744559232423, + "language_loss": 0.82783937, + "learning_rate": 0.0007493701943789098, + "loss": 0.8389588, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.19750977, + "step": 1839, + "time_per_iteration": 2.780212640762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107677, + "balance_loss_mlp": 1.08844888, + "epoch": 0.35398230088495575, + "flos": 506364701184.0, + "grad_norm": 
0.07955024359155173, + "language_loss": 0.82622725, + "learning_rate": 0.000749100116249692, + "loss": 0.83730406, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.19213867, + "step": 1840, + "time_per_iteration": 2.59558367729187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110996, + "balance_loss_mlp": 1.09009957, + "epoch": 0.35417468257021933, + "flos": 508034944512.0, + "grad_norm": 0.09363875008830587, + "language_loss": 0.86041892, + "learning_rate": 0.0007488299414112321, + "loss": 0.87151849, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.19848633, + "step": 1841, + "time_per_iteration": 2.625204563140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112719, + "balance_loss_mlp": 1.0932045, + "epoch": 0.35436706425548287, + "flos": 656437395456.0, + "grad_norm": 0.07784236461393054, + "language_loss": 0.77495539, + "learning_rate": 0.0007485596699684215, + "loss": 0.78608257, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.19506836, + "step": 1842, + "time_per_iteration": 2.889179229736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110913, + "balance_loss_mlp": 1.0890193, + "epoch": 0.35455944594074645, + "flos": 652634744832.0, + "grad_norm": 0.0730470956511186, + "language_loss": 0.85287404, + "learning_rate": 0.000748289302026189, + "loss": 0.86396539, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.2010498, + "step": 1843, + "time_per_iteration": 2.8508758544921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117167, + "balance_loss_mlp": 1.09693718, + "epoch": 0.35475182762601, + "flos": 848593262592.0, + "grad_norm": 0.08361202953284802, + "language_loss": 0.85558116, + "learning_rate": 0.0007480188376895004, + "loss": 0.8667528, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.20227051, + "step": 1844, + "time_per_iteration": 3.0799713134765625 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01058665, + "balance_loss_mlp": 1.04655302, + "epoch": 0.3549442093112736, + "flos": 1521468043776.0, + "grad_norm": 0.036648944322370085, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.74870002, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.12109375, + "step": 1845, + "time_per_iteration": 4.911001205444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151869, + "balance_loss_mlp": 1.1320442, + "epoch": 0.3551365909965371, + "flos": 651411611136.0, + "grad_norm": 0.08485938300722028, + "language_loss": 0.78214371, + "learning_rate": 0.0007474776202528074, + "loss": 0.79366243, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.19824219, + "step": 1846, + "time_per_iteration": 3.0216140747070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161099, + "balance_loss_mlp": 1.1411432, + "epoch": 0.3553289726818007, + "flos": 897458724864.0, + "grad_norm": 0.08254469932015057, + "language_loss": 0.81304067, + "learning_rate": 0.000747206867362922, + "loss": 0.82465172, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.19946289, + "step": 1847, + "time_per_iteration": 3.090902090072632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160243, + "balance_loss_mlp": 1.13996506, + "epoch": 0.3555213543670643, + "flos": 688491394560.0, + "grad_norm": 0.07042821685917994, + "language_loss": 0.83881712, + "learning_rate": 0.0007469360184988194, + "loss": 0.85041958, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.20275879, + "step": 1848, + "time_per_iteration": 2.834099292755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164664, + "balance_loss_mlp": 1.14419615, + "epoch": 0.3557137360523278, + "flos": 538564432896.0, + "grad_norm": 0.08278620993607219, + "language_loss": 0.86537004, + "learning_rate": 0.0007466650737656518, + "loss": 0.87701666, + "num_input_tokens_seen": 
153269168, + "router_z_loss_mlp": 0.20471191, + "step": 1849, + "time_per_iteration": 2.6372272968292236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164893, + "balance_loss_mlp": 1.14411473, + "epoch": 0.3559061177375914, + "flos": 402261520896.0, + "grad_norm": 0.1003606576453008, + "language_loss": 0.90052241, + "learning_rate": 0.0007463940332686098, + "loss": 0.9121713, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.20788574, + "step": 1850, + "time_per_iteration": 2.485778331756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138299, + "balance_loss_mlp": 1.11759257, + "epoch": 0.35609849942285493, + "flos": 696568398336.0, + "grad_norm": 0.07662996022318802, + "language_loss": 0.83963442, + "learning_rate": 0.0007461228971129205, + "loss": 0.85101742, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.20715332, + "step": 1851, + "time_per_iteration": 2.9709644317626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119617, + "balance_loss_mlp": 1.09905326, + "epoch": 0.3562908811081185, + "flos": 568928365056.0, + "grad_norm": 0.09722709387095821, + "language_loss": 0.8525731, + "learning_rate": 0.0007458516654038483, + "loss": 0.86376923, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.20568848, + "step": 1852, + "time_per_iteration": 2.678692579269409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122543, + "balance_loss_mlp": 1.10156226, + "epoch": 0.35648326279338205, + "flos": 682386011136.0, + "grad_norm": 0.11064851070237179, + "language_loss": 0.86565018, + "learning_rate": 0.0007455803382466946, + "loss": 0.87687564, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.20983887, + "step": 1853, + "time_per_iteration": 2.8357412815093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118205, + "balance_loss_mlp": 1.0977726, + "epoch": 0.35667564447864564, + "flos": 629139737088.0, + "grad_norm": 
0.07486516106338226, + "language_loss": 0.87089902, + "learning_rate": 0.0007453089157467979, + "loss": 0.88208103, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.2043457, + "step": 1854, + "time_per_iteration": 2.808497667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110339, + "balance_loss_mlp": 1.08300531, + "epoch": 0.35686802616390917, + "flos": 814048579584.0, + "grad_norm": 0.0938349401282225, + "language_loss": 0.82008994, + "learning_rate": 0.0007450373980095341, + "loss": 0.83112389, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.20385742, + "step": 1855, + "time_per_iteration": 3.127755641937256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102136, + "balance_loss_mlp": 1.08226347, + "epoch": 0.35706040784917276, + "flos": 526178198016.0, + "grad_norm": 0.07357008991516471, + "language_loss": 0.86741251, + "learning_rate": 0.0007447657851403155, + "loss": 0.87843382, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.1986084, + "step": 1856, + "time_per_iteration": 2.662548780441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104026, + "balance_loss_mlp": 1.08421302, + "epoch": 0.35725278953443634, + "flos": 511970844672.0, + "grad_norm": 0.09605793543255373, + "language_loss": 0.78325486, + "learning_rate": 0.0007444940772445915, + "loss": 0.79429507, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.19812012, + "step": 1857, + "time_per_iteration": 2.7455575466156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098079, + "balance_loss_mlp": 1.07937515, + "epoch": 0.3574451712196999, + "flos": 487428171264.0, + "grad_norm": 0.09380435326028273, + "language_loss": 0.80025625, + "learning_rate": 0.0007442222744278484, + "loss": 0.81123704, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.18688965, + "step": 1858, + "time_per_iteration": 2.7159781455993652 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.0110587, + "balance_loss_mlp": 1.08752322, + "epoch": 0.35763755290496346, + "flos": 550671312384.0, + "grad_norm": 0.07197173632554923, + "language_loss": 0.8371805, + "learning_rate": 0.0007439503767956099, + "loss": 0.84823918, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.18347168, + "step": 1859, + "time_per_iteration": 2.7746405601501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129586, + "balance_loss_mlp": 1.11757004, + "epoch": 0.357829934590227, + "flos": 1504083561984.0, + "grad_norm": 0.053548748661834844, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80801189, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.12011719, + "step": 1860, + "time_per_iteration": 4.952972412109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141933, + "balance_loss_mlp": 1.12300301, + "epoch": 0.3580223162754906, + "flos": 568695997440.0, + "grad_norm": 0.07146029040980974, + "language_loss": 0.86061597, + "learning_rate": 0.000743406297506922, + "loss": 0.87203526, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.18920898, + "step": 1861, + "time_per_iteration": 2.788799285888672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155472, + "balance_loss_mlp": 1.13686371, + "epoch": 0.3582146979607541, + "flos": 626473585152.0, + "grad_norm": 0.08496046226468609, + "language_loss": 0.83806807, + "learning_rate": 0.0007431341160617031, + "loss": 0.84962279, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.18615723, + "step": 1862, + "time_per_iteration": 2.891972780227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153957, + "balance_loss_mlp": 1.13561106, + "epoch": 0.3584070796460177, + "flos": 507271403520.0, + "grad_norm": 0.08024798355603865, + "language_loss": 0.87945759, + "learning_rate": 0.0007428618402234491, + "loss": 0.89099711, + "num_input_tokens_seen": 
154459952, + "router_z_loss_mlp": 0.18347168, + "step": 1863, + "time_per_iteration": 2.6548287868499756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157244, + "balance_loss_mlp": 1.13868272, + "epoch": 0.3585994613312813, + "flos": 606479851008.0, + "grad_norm": 0.10629387801358743, + "language_loss": 0.79862851, + "learning_rate": 0.0007425894700978668, + "loss": 0.81020093, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.18579102, + "step": 1864, + "time_per_iteration": 2.80774188041687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153191, + "balance_loss_mlp": 1.13476086, + "epoch": 0.3587918430165448, + "flos": 1412886799872.0, + "grad_norm": 0.07530240473897643, + "language_loss": 0.79704821, + "learning_rate": 0.0007423170057906996, + "loss": 0.80858016, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.1842041, + "step": 1865, + "time_per_iteration": 3.8680994510650635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145718, + "balance_loss_mlp": 1.12701416, + "epoch": 0.3589842247018084, + "flos": 478553121792.0, + "grad_norm": 0.09184761749378255, + "language_loss": 0.86028153, + "learning_rate": 0.0007420444474077275, + "loss": 0.87173867, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.18688965, + "step": 1866, + "time_per_iteration": 2.5685620307922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113899, + "balance_loss_mlp": 1.12003553, + "epoch": 0.35917660638707194, + "flos": 504711710208.0, + "grad_norm": 0.09893409220555562, + "language_loss": 0.89461643, + "learning_rate": 0.0007417717950547671, + "loss": 0.90600634, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.18945312, + "step": 1867, + "time_per_iteration": 2.671124219894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107889, + "balance_loss_mlp": 1.06611049, + "epoch": 0.3593689880723355, + "flos": 1492129382400.0, + "grad_norm": 
0.038408778239575524, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77075499, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.12792969, + "step": 1868, + "time_per_iteration": 4.9185333251953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122572, + "balance_loss_mlp": 1.10416651, + "epoch": 0.35956136975759906, + "flos": 528629234688.0, + "grad_norm": 0.07553494616843248, + "language_loss": 0.84798276, + "learning_rate": 0.0007412262088623299, + "loss": 0.85920852, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.18408203, + "step": 1869, + "time_per_iteration": 2.7392468452453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120202, + "balance_loss_mlp": 1.10186732, + "epoch": 0.35975375144286265, + "flos": 534917426688.0, + "grad_norm": 0.08536155576366684, + "language_loss": 0.79418659, + "learning_rate": 0.0007409532752346684, + "loss": 0.80538857, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.18334961, + "step": 1870, + "time_per_iteration": 2.696479082107544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119342, + "balance_loss_mlp": 1.10078073, + "epoch": 0.3599461331281262, + "flos": 504941506560.0, + "grad_norm": 0.06482127106924716, + "language_loss": 0.88322479, + "learning_rate": 0.0007406802480606491, + "loss": 0.89441818, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.18566895, + "step": 1871, + "time_per_iteration": 2.636009931564331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125376, + "balance_loss_mlp": 1.1068871, + "epoch": 0.36013851481338977, + "flos": 511533646848.0, + "grad_norm": 0.08328980109467413, + "language_loss": 0.90382409, + "learning_rate": 0.0007404071274462707, + "loss": 0.91507781, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.18493652, + "step": 1872, + "time_per_iteration": 2.6033034324645996 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01126247, + "balance_loss_mlp": 1.10767388, + "epoch": 0.36033089649865335, + "flos": 547590357504.0, + "grad_norm": 0.08507135616363887, + "language_loss": 0.83713084, + "learning_rate": 0.0007401339134975682, + "loss": 0.84839332, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.18579102, + "step": 1873, + "time_per_iteration": 2.6768579483032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124337, + "balance_loss_mlp": 1.1061461, + "epoch": 0.3605232781839169, + "flos": 458655561216.0, + "grad_norm": 0.08710024588150622, + "language_loss": 0.8447001, + "learning_rate": 0.0007398606063206122, + "loss": 0.8559435, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.18200684, + "step": 1874, + "time_per_iteration": 2.6102805137634277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118797, + "balance_loss_mlp": 1.1010226, + "epoch": 0.36071565986918047, + "flos": 509559455232.0, + "grad_norm": 0.09331990326127676, + "language_loss": 0.78271621, + "learning_rate": 0.0007395872060215101, + "loss": 0.79390419, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.17773438, + "step": 1875, + "time_per_iteration": 2.6235439777374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125165, + "balance_loss_mlp": 1.10746276, + "epoch": 0.360908041554444, + "flos": 559195799040.0, + "grad_norm": 0.08705098996186143, + "language_loss": 0.8794744, + "learning_rate": 0.0007393137127064056, + "loss": 0.89072609, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.17724609, + "step": 1876, + "time_per_iteration": 2.693005323410034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131992, + "balance_loss_mlp": 1.11434913, + "epoch": 0.3611004232397076, + "flos": 523845729792.0, + "grad_norm": 0.07970542462566557, + "language_loss": 0.84223264, + "learning_rate": 0.0007390401264814779, + "loss": 0.85355258, + 
"num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.17675781, + "step": 1877, + "time_per_iteration": 2.6267154216766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144153, + "balance_loss_mlp": 1.12600899, + "epoch": 0.3612928049249711, + "flos": 540988305408.0, + "grad_norm": 0.11052243492945069, + "language_loss": 0.84164327, + "learning_rate": 0.0007387664474529427, + "loss": 0.8530848, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.18151855, + "step": 1878, + "time_per_iteration": 2.6380414962768555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114366, + "balance_loss_mlp": 1.12561202, + "epoch": 0.3614851866102347, + "flos": 552556670976.0, + "grad_norm": 0.06785614970382317, + "language_loss": 0.91167343, + "learning_rate": 0.0007384926757270518, + "loss": 0.92311001, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.18054199, + "step": 1879, + "time_per_iteration": 2.6760640144348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148828, + "balance_loss_mlp": 1.13057721, + "epoch": 0.36167756829549824, + "flos": 772071865344.0, + "grad_norm": 0.07379174248702317, + "language_loss": 0.79513329, + "learning_rate": 0.0007382188114100924, + "loss": 0.80662155, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.18249512, + "step": 1880, + "time_per_iteration": 2.980865716934204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140419, + "balance_loss_mlp": 1.12196517, + "epoch": 0.36186994998076183, + "flos": 711885086208.0, + "grad_norm": 0.08452869991753884, + "language_loss": 0.81477511, + "learning_rate": 0.0007379448546083884, + "loss": 0.82617927, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.18457031, + "step": 1881, + "time_per_iteration": 2.9168553352355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122618, + "balance_loss_mlp": 1.10411692, + "epoch": 0.3620623316660254, + "flos": 
747546444288.0, + "grad_norm": 0.07446388495521607, + "language_loss": 0.87973779, + "learning_rate": 0.0007376708054282992, + "loss": 0.89096403, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.18481445, + "step": 1882, + "time_per_iteration": 2.987179756164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115299, + "balance_loss_mlp": 1.09675002, + "epoch": 0.36225471335128895, + "flos": 482555833344.0, + "grad_norm": 0.06334344400813875, + "language_loss": 0.83726645, + "learning_rate": 0.0007373966639762201, + "loss": 0.84841949, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.18530273, + "step": 1883, + "time_per_iteration": 2.611685276031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107737, + "balance_loss_mlp": 1.08896196, + "epoch": 0.36244709503655254, + "flos": 506905786368.0, + "grad_norm": 0.34913247510054485, + "language_loss": 0.88361132, + "learning_rate": 0.0007371224303585822, + "loss": 0.89468867, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.18762207, + "step": 1884, + "time_per_iteration": 2.5775835514068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055354, + "balance_loss_mlp": 1.04219282, + "epoch": 0.36263947672181607, + "flos": 1394050466304.0, + "grad_norm": 0.031056792089232132, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81412423, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.13183594, + "step": 1885, + "time_per_iteration": 4.700505256652832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125656, + "balance_loss_mlp": 1.10721421, + "epoch": 0.36283185840707965, + "flos": 653296969728.0, + "grad_norm": 0.08679320645386224, + "language_loss": 0.82572937, + "learning_rate": 0.0007365736870525335, + "loss": 0.83698595, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.18457031, + "step": 1886, + "time_per_iteration": 2.859740734100342 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129292, + "balance_loss_mlp": 1.11139846, + "epoch": 0.3630242400923432, + "flos": 488863848960.0, + "grad_norm": 0.08795223769340633, + "language_loss": 0.82107997, + "learning_rate": 0.000736299177577164, + "loss": 0.8323729, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.17907715, + "step": 1887, + "time_per_iteration": 2.5841786861419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130014, + "balance_loss_mlp": 1.11198997, + "epoch": 0.3632166217776068, + "flos": 517159613952.0, + "grad_norm": 0.08315005772253937, + "language_loss": 0.83388066, + "learning_rate": 0.0007360245763623174, + "loss": 0.84518075, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.18029785, + "step": 1888, + "time_per_iteration": 2.665529489517212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145189, + "balance_loss_mlp": 1.12729573, + "epoch": 0.36340900346287036, + "flos": 646173656064.0, + "grad_norm": 0.088670630002398, + "language_loss": 0.89456129, + "learning_rate": 0.0007357498835146039, + "loss": 0.90601313, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.17895508, + "step": 1889, + "time_per_iteration": 2.8847129344940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156911, + "balance_loss_mlp": 1.13911295, + "epoch": 0.3636013851481339, + "flos": 553327552512.0, + "grad_norm": 0.10357296063524607, + "language_loss": 0.87070376, + "learning_rate": 0.0007354750991406684, + "loss": 0.8822729, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.17810059, + "step": 1890, + "time_per_iteration": 2.723062753677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159624, + "balance_loss_mlp": 1.14133692, + "epoch": 0.3637937668333975, + "flos": 546653919744.0, + "grad_norm": 0.08144896750451855, + "language_loss": 0.80397975, + "learning_rate": 0.0007352002233471919, + "loss": 
0.81557596, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.18310547, + "step": 1891, + "time_per_iteration": 2.6574442386627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175761, + "balance_loss_mlp": 1.15818954, + "epoch": 0.363986148518661, + "flos": 538112180736.0, + "grad_norm": 0.12092525276023756, + "language_loss": 0.79267627, + "learning_rate": 0.0007349252562408906, + "loss": 0.80443388, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.17590332, + "step": 1892, + "time_per_iteration": 2.7125816345214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180182, + "balance_loss_mlp": 1.16231263, + "epoch": 0.3641785302039246, + "flos": 660217651200.0, + "grad_norm": 0.10164191197483487, + "language_loss": 0.81473255, + "learning_rate": 0.0007346501979285158, + "loss": 0.82653439, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.17883301, + "step": 1893, + "time_per_iteration": 2.902371406555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069378, + "balance_loss_mlp": 1.05621696, + "epoch": 0.36437091188918813, + "flos": 1468743031296.0, + "grad_norm": 0.029928407037273664, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.8160848, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.13183594, + "step": 1894, + "time_per_iteration": 4.841979265213013 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166903, + "balance_loss_mlp": 1.14858055, + "epoch": 0.3645632935744517, + "flos": 597298281984.0, + "grad_norm": 0.079124644393009, + "language_loss": 0.85946983, + "learning_rate": 0.0007340998081127308, + "loss": 0.87113881, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.18322754, + "step": 1895, + "time_per_iteration": 2.7981679439544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149308, + "balance_loss_mlp": 1.13090205, + "epoch": 0.36475567525971525, + 
"flos": 599509610496.0, + "grad_norm": 0.08117131709807607, + "language_loss": 0.90645039, + "learning_rate": 0.0007338244768230007, + "loss": 0.91794348, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.18408203, + "step": 1896, + "time_per_iteration": 2.821958541870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131855, + "balance_loss_mlp": 1.11337733, + "epoch": 0.36494805694497884, + "flos": 798403350528.0, + "grad_norm": 0.06648659114179455, + "language_loss": 0.88624144, + "learning_rate": 0.0007335490547545578, + "loss": 0.89756, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.18469238, + "step": 1897, + "time_per_iteration": 3.0718753337860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115822, + "balance_loss_mlp": 1.09670115, + "epoch": 0.3651404386302424, + "flos": 637313287680.0, + "grad_norm": 0.06783762736794967, + "language_loss": 0.82265627, + "learning_rate": 0.0007332735420143308, + "loss": 0.8338145, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.19091797, + "step": 1898, + "time_per_iteration": 2.7864439487457275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103004, + "balance_loss_mlp": 1.08431149, + "epoch": 0.36533282031550596, + "flos": 491581757952.0, + "grad_norm": 0.10561663647405507, + "language_loss": 0.86410689, + "learning_rate": 0.0007329979387092826, + "loss": 0.87513697, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.18664551, + "step": 1899, + "time_per_iteration": 2.6032557487487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099553, + "balance_loss_mlp": 1.08087325, + "epoch": 0.36552520200076954, + "flos": 855970965504.0, + "grad_norm": 0.0619875823145499, + "language_loss": 0.83878422, + "learning_rate": 0.0007327222449464124, + "loss": 0.84977973, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.18676758, + "step": 1900, + "time_per_iteration": 3.2741036415100098 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103796, + "balance_loss_mlp": 1.08450782, + "epoch": 0.3657175836860331, + "flos": 483702243840.0, + "grad_norm": 0.07856096432694096, + "language_loss": 0.885158, + "learning_rate": 0.0007324464608327538, + "loss": 0.89619601, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.19287109, + "step": 1901, + "time_per_iteration": 2.678788900375366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094923, + "balance_loss_mlp": 1.07613552, + "epoch": 0.36590996537129666, + "flos": 434792365056.0, + "grad_norm": 0.117877128585243, + "language_loss": 0.88101745, + "learning_rate": 0.0007321705864753758, + "loss": 0.8919667, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.18774414, + "step": 1902, + "time_per_iteration": 2.746980905532837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104989, + "balance_loss_mlp": 1.08645177, + "epoch": 0.3661023470565602, + "flos": 712206286848.0, + "grad_norm": 0.07495737234387592, + "language_loss": 0.83840346, + "learning_rate": 0.0007318946219813823, + "loss": 0.84945333, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.18530273, + "step": 1903, + "time_per_iteration": 3.0181055068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113516, + "balance_loss_mlp": 1.09416842, + "epoch": 0.3662947287418238, + "flos": 564760097280.0, + "grad_norm": 0.08147269799104237, + "language_loss": 0.89553183, + "learning_rate": 0.000731618567457912, + "loss": 0.90666699, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.19335938, + "step": 1904, + "time_per_iteration": 2.656008243560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112169, + "balance_loss_mlp": 1.10242581, + "epoch": 0.3664871104270873, + "flos": 789752954880.0, + "grad_norm": 0.09666599698156476, + "language_loss": 0.86684108, + "learning_rate": 0.000731342423012139, + "loss": 
0.87805796, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.19250488, + "step": 1905, + "time_per_iteration": 3.0675060749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130977, + "balance_loss_mlp": 1.11136723, + "epoch": 0.3666794921123509, + "flos": 752557174272.0, + "grad_norm": 0.07693711099894461, + "language_loss": 0.82752407, + "learning_rate": 0.0007310661887512722, + "loss": 0.83883387, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.19616699, + "step": 1906, + "time_per_iteration": 3.058940887451172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121537, + "balance_loss_mlp": 1.10290504, + "epoch": 0.3668718737976145, + "flos": 523531869696.0, + "grad_norm": 0.08447106182036945, + "language_loss": 0.8153969, + "learning_rate": 0.0007307898647825549, + "loss": 0.82661223, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.1862793, + "step": 1907, + "time_per_iteration": 2.6844449043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123702, + "balance_loss_mlp": 1.10468769, + "epoch": 0.367064255482878, + "flos": 571967474688.0, + "grad_norm": 0.09351646457276126, + "language_loss": 0.89255947, + "learning_rate": 0.0007305134512132659, + "loss": 0.90379649, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.18994141, + "step": 1908, + "time_per_iteration": 2.709672451019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110565, + "balance_loss_mlp": 1.09136009, + "epoch": 0.3672566371681416, + "flos": 447114359808.0, + "grad_norm": 0.10593037141853442, + "language_loss": 0.82889271, + "learning_rate": 0.0007302369481507183, + "loss": 0.83999836, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.19189453, + "step": 1909, + "time_per_iteration": 2.521117687225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042583, + "balance_loss_mlp": 1.03214002, + "epoch": 0.36744901885340514, + 
"flos": 1540090713600.0, + "grad_norm": 0.025696927495133286, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81004339, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10449219, + "step": 1910, + "time_per_iteration": 4.8944993019104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109603, + "balance_loss_mlp": 1.09143519, + "epoch": 0.36764140053866873, + "flos": 563685267456.0, + "grad_norm": 0.13197556892024634, + "language_loss": 0.85332, + "learning_rate": 0.000729683673975274, + "loss": 0.864416, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.18164062, + "step": 1911, + "time_per_iteration": 2.6855151653289795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113177, + "balance_loss_mlp": 1.09509254, + "epoch": 0.36783378222393226, + "flos": 1216663981056.0, + "grad_norm": 0.05917682500902713, + "language_loss": 0.82910979, + "learning_rate": 0.0007294069030771774, + "loss": 0.84024155, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.1809082, + "step": 1912, + "time_per_iteration": 3.696908712387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119134, + "balance_loss_mlp": 1.10070467, + "epoch": 0.36802616390919585, + "flos": 498724895232.0, + "grad_norm": 0.2785371066278341, + "language_loss": 0.90901196, + "learning_rate": 0.0007291300431154224, + "loss": 0.92020327, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.18432617, + "step": 1913, + "time_per_iteration": 2.666469097137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066964, + "balance_loss_mlp": 1.05699825, + "epoch": 0.36821854559445943, + "flos": 1582146349056.0, + "grad_norm": 0.035296075115353785, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71456701, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.09960938, + "step": 1914, + "time_per_iteration": 5.019417762756348 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176446, + "balance_loss_mlp": 1.1579566, + "epoch": 0.36841092727972297, + "flos": 835626295296.0, + "grad_norm": 0.09302167105112862, + "language_loss": 0.79388487, + "learning_rate": 0.0007285760564309179, + "loss": 0.80564928, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.18493652, + "step": 1915, + "time_per_iteration": 3.112898826599121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01204501, + "balance_loss_mlp": 1.18492651, + "epoch": 0.36860330896498655, + "flos": 689855118336.0, + "grad_norm": 0.10352341742670183, + "language_loss": 0.84420514, + "learning_rate": 0.0007282989299232448, + "loss": 0.85625011, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.19567871, + "step": 1916, + "time_per_iteration": 3.0435094833374023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01222721, + "balance_loss_mlp": 1.20364785, + "epoch": 0.3687956906502501, + "flos": 554182497792.0, + "grad_norm": 0.07568711881104075, + "language_loss": 0.83658814, + "learning_rate": 0.0007280217147820668, + "loss": 0.84881544, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.19042969, + "step": 1917, + "time_per_iteration": 2.618802547454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214339, + "balance_loss_mlp": 1.19502735, + "epoch": 0.3689880723355137, + "flos": 576703991808.0, + "grad_norm": 0.06430089788192027, + "language_loss": 0.78882575, + "learning_rate": 0.0007277444111150079, + "loss": 0.80096912, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.19299316, + "step": 1918, + "time_per_iteration": 2.705514669418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212887, + "balance_loss_mlp": 1.19302678, + "epoch": 0.3691804540207772, + "flos": 528868942848.0, + "grad_norm": 0.1316988542142886, + "language_loss": 0.84107184, + "learning_rate": 0.0007274670190297272, + "loss": 
0.85320067, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.19848633, + "step": 1919, + "time_per_iteration": 2.643360137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216839, + "balance_loss_mlp": 1.19697857, + "epoch": 0.3693728357060408, + "flos": 561019115520.0, + "grad_norm": 0.08424482176176182, + "language_loss": 0.82129955, + "learning_rate": 0.0007271895386339179, + "loss": 0.83346796, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.19848633, + "step": 1920, + "time_per_iteration": 2.7766342163085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01209662, + "balance_loss_mlp": 1.1898967, + "epoch": 0.3695652173913043, + "flos": 579770265600.0, + "grad_norm": 0.08336147686301533, + "language_loss": 0.83142531, + "learning_rate": 0.0007269119700353073, + "loss": 0.84352195, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.19763184, + "step": 1921, + "time_per_iteration": 2.747455596923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217254, + "balance_loss_mlp": 1.19840705, + "epoch": 0.3697575990765679, + "flos": 512914622976.0, + "grad_norm": 0.06910916264284567, + "language_loss": 0.85129571, + "learning_rate": 0.0007266343133416571, + "loss": 0.86346817, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.18811035, + "step": 1922, + "time_per_iteration": 2.815875768661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107275, + "balance_loss_mlp": 1.09573579, + "epoch": 0.3699499807618315, + "flos": 1570640025600.0, + "grad_norm": 0.04105564932095409, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78224194, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.11523438, + "step": 1923, + "time_per_iteration": 4.86853289604187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01198125, + "balance_loss_mlp": 1.17899168, + "epoch": 0.37014236244709503, + 
"flos": 497338776576.0, + "grad_norm": 0.1110881339245658, + "language_loss": 0.84574348, + "learning_rate": 0.0007260787361004556, + "loss": 0.85772473, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.19128418, + "step": 1924, + "time_per_iteration": 2.580287456512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060779, + "balance_loss_mlp": 1.0494777, + "epoch": 0.3703347441323586, + "flos": 1444368485376.0, + "grad_norm": 0.023148070033358246, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74822283, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11279297, + "step": 1925, + "time_per_iteration": 4.9416913986206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175905, + "balance_loss_mlp": 1.15692663, + "epoch": 0.37052712581762215, + "flos": 563601203712.0, + "grad_norm": 0.06834955035904498, + "language_loss": 0.87516356, + "learning_rate": 0.0007255228077730903, + "loss": 0.8869226, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.18969727, + "step": 1926, + "time_per_iteration": 2.7211105823516846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176426, + "balance_loss_mlp": 1.15784156, + "epoch": 0.37071950750288574, + "flos": 926078261760.0, + "grad_norm": 0.06265343241116231, + "language_loss": 0.81563449, + "learning_rate": 0.0007252447122218632, + "loss": 0.82739878, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.18579102, + "step": 1927, + "time_per_iteration": 3.151231527328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172974, + "balance_loss_mlp": 1.15472341, + "epoch": 0.37091188918814927, + "flos": 418312014336.0, + "grad_norm": 0.09894828359622332, + "language_loss": 0.88063776, + "learning_rate": 0.0007249665292228834, + "loss": 0.89236754, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.18261719, + "step": 1928, + "time_per_iteration": 
2.702021360397339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173693, + "balance_loss_mlp": 1.1554302, + "epoch": 0.37110427087341286, + "flos": 463182105600.0, + "grad_norm": 0.08781668530165682, + "language_loss": 0.83526367, + "learning_rate": 0.000724688258884151, + "loss": 0.8470006, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.18249512, + "step": 1929, + "time_per_iteration": 2.560795783996582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162512, + "balance_loss_mlp": 1.14461839, + "epoch": 0.3712966525586764, + "flos": 849658180608.0, + "grad_norm": 0.07372740974795068, + "language_loss": 0.86387187, + "learning_rate": 0.0007244099013137002, + "loss": 0.87549698, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.17907715, + "step": 1930, + "time_per_iteration": 3.090304374694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153317, + "balance_loss_mlp": 1.1359247, + "epoch": 0.37148903424394, + "flos": 925954550784.0, + "grad_norm": 0.07369885077257772, + "language_loss": 0.88680494, + "learning_rate": 0.0007241314566195993, + "loss": 0.89833808, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.17407227, + "step": 1931, + "time_per_iteration": 3.2688889503479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140069, + "balance_loss_mlp": 1.12190151, + "epoch": 0.37168141592920356, + "flos": 519815854080.0, + "grad_norm": 0.1370251830388882, + "language_loss": 0.85430074, + "learning_rate": 0.0007238529249099496, + "loss": 0.86570138, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.18164062, + "step": 1932, + "time_per_iteration": 2.6766042709350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056936, + "balance_loss_mlp": 1.04673159, + "epoch": 0.3718737976144671, + "flos": 1445895567360.0, + "grad_norm": 0.03186229248255652, + "language_loss": 0.77856874, + "learning_rate": 
0.0007235743062928872, + "loss": 0.78913808, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.10205078, + "step": 1933, + "time_per_iteration": 4.938454866409302 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121071, + "balance_loss_mlp": 1.10291553, + "epoch": 0.3720661792997307, + "flos": 759564490752.0, + "grad_norm": 0.0858411932854742, + "language_loss": 0.80716681, + "learning_rate": 0.000723295600876581, + "loss": 0.81837749, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.1817627, + "step": 1934, + "time_per_iteration": 3.02756404876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127839, + "balance_loss_mlp": 1.10930252, + "epoch": 0.3722585609849942, + "flos": 516956981760.0, + "grad_norm": 0.07598025600252532, + "language_loss": 0.87578201, + "learning_rate": 0.0007230168087692344, + "loss": 0.8870604, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.18530273, + "step": 1935, + "time_per_iteration": 2.6842763423919678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117422, + "balance_loss_mlp": 1.09867072, + "epoch": 0.3724509426702578, + "flos": 782464084992.0, + "grad_norm": 0.07638826910824403, + "language_loss": 0.82760978, + "learning_rate": 0.0007227379300790839, + "loss": 0.83878398, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.1875, + "step": 1936, + "time_per_iteration": 3.028691530227661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126722, + "balance_loss_mlp": 1.10711217, + "epoch": 0.37264332435552133, + "flos": 391720997376.0, + "grad_norm": 0.1377793635442251, + "language_loss": 0.85613376, + "learning_rate": 0.0007224589649143997, + "loss": 0.86740094, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.19604492, + "step": 1937, + "time_per_iteration": 2.5564050674438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129561, + "balance_loss_mlp": 1.11017799, + 
"epoch": 0.3728357060407849, + "flos": 542861180928.0, + "grad_norm": 0.07798966628460335, + "language_loss": 0.80875593, + "learning_rate": 0.0007221799133834861, + "loss": 0.82005155, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.19360352, + "step": 1938, + "time_per_iteration": 2.6535797119140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128571, + "balance_loss_mlp": 1.10997486, + "epoch": 0.3730280877260485, + "flos": 433571802624.0, + "grad_norm": 0.20771096851505863, + "language_loss": 0.81190193, + "learning_rate": 0.00072190077559468, + "loss": 0.82318759, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.18591309, + "step": 1939, + "time_per_iteration": 2.5281853675842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119884, + "balance_loss_mlp": 1.10124016, + "epoch": 0.37322046941131204, + "flos": 531485535744.0, + "grad_norm": 0.07206730115622964, + "language_loss": 0.89147639, + "learning_rate": 0.0007216215516563527, + "loss": 0.90267527, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.18640137, + "step": 1940, + "time_per_iteration": 2.7357096672058105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112047, + "balance_loss_mlp": 1.1024456, + "epoch": 0.3734128510965756, + "flos": 531549775872.0, + "grad_norm": 0.09123969930056855, + "language_loss": 0.839782, + "learning_rate": 0.0007213422416769083, + "loss": 0.8509866, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.18029785, + "step": 1941, + "time_per_iteration": 2.6104605197906494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119536, + "balance_loss_mlp": 1.10109389, + "epoch": 0.37360523278183916, + "flos": 500442126336.0, + "grad_norm": 0.07207094919122449, + "language_loss": 0.75049472, + "learning_rate": 0.0007210628457647849, + "loss": 0.76169002, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.18444824, + "step": 1942, + 
"time_per_iteration": 2.5805821418762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129118, + "balance_loss_mlp": 1.11117733, + "epoch": 0.37379761446710275, + "flos": 547943491584.0, + "grad_norm": 0.10610035509825085, + "language_loss": 0.78376162, + "learning_rate": 0.000720783364028453, + "loss": 0.79505277, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.17956543, + "step": 1943, + "time_per_iteration": 2.780245542526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140529, + "balance_loss_mlp": 1.1218369, + "epoch": 0.3739899961523663, + "flos": 475761060864.0, + "grad_norm": 0.07224730964326329, + "language_loss": 0.87268645, + "learning_rate": 0.0007205037965764177, + "loss": 0.88409173, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.18688965, + "step": 1944, + "time_per_iteration": 2.5735671520233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151311, + "balance_loss_mlp": 1.13291705, + "epoch": 0.37418237783762986, + "flos": 611915668992.0, + "grad_norm": 0.07659834869138271, + "language_loss": 0.8526088, + "learning_rate": 0.0007202241435172161, + "loss": 0.86412191, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.18408203, + "step": 1945, + "time_per_iteration": 2.7935566902160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126679, + "balance_loss_mlp": 1.10871434, + "epoch": 0.3743747595228934, + "flos": 766287682560.0, + "grad_norm": 0.3794268789868596, + "language_loss": 0.88413203, + "learning_rate": 0.0007199444049594198, + "loss": 0.89539886, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.17956543, + "step": 1946, + "time_per_iteration": 2.995715379714966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127316, + "balance_loss_mlp": 1.10844493, + "epoch": 0.374567141208157, + "flos": 524394155520.0, + "grad_norm": 0.0746444377907342, + "language_loss": 0.83035469, + 
"learning_rate": 0.0007196645810116322, + "loss": 0.8416279, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.1887207, + "step": 1947, + "time_per_iteration": 2.766355037689209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142049, + "balance_loss_mlp": 1.12292802, + "epoch": 0.37475952289342057, + "flos": 681375421440.0, + "grad_norm": 0.07850495494132069, + "language_loss": 0.83822554, + "learning_rate": 0.0007193846717824912, + "loss": 0.84964609, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.19104004, + "step": 1948, + "time_per_iteration": 2.925459623336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133743, + "balance_loss_mlp": 1.11488414, + "epoch": 0.3749519045786841, + "flos": 460291299840.0, + "grad_norm": 0.08022476151722048, + "language_loss": 0.88327885, + "learning_rate": 0.0007191046773806669, + "loss": 0.89461625, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.18859863, + "step": 1949, + "time_per_iteration": 2.5894553661346436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123414, + "balance_loss_mlp": 1.10373282, + "epoch": 0.3751442862639477, + "flos": 954853443072.0, + "grad_norm": 0.08918312945621011, + "language_loss": 0.83225584, + "learning_rate": 0.0007188245979148631, + "loss": 0.84349, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.19665527, + "step": 1950, + "time_per_iteration": 3.159851551055908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126856, + "balance_loss_mlp": 1.1067214, + "epoch": 0.3753366679492112, + "flos": 527747125248.0, + "grad_norm": 0.11158799296642749, + "language_loss": 0.87878865, + "learning_rate": 0.0007185444334938157, + "loss": 0.89005721, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.20129395, + "step": 1951, + "time_per_iteration": 2.7033133506774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111192, + "balance_loss_mlp": 
1.09180903, + "epoch": 0.3755290496344748, + "flos": 521797386240.0, + "grad_norm": 0.09975748916923241, + "language_loss": 0.8500011, + "learning_rate": 0.0007182641842262947, + "loss": 0.86111307, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.19372559, + "step": 1952, + "time_per_iteration": 2.626728057861328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108209, + "balance_loss_mlp": 1.08878958, + "epoch": 0.37572143131973834, + "flos": 621121830912.0, + "grad_norm": 0.09334076595597436, + "language_loss": 0.77694595, + "learning_rate": 0.0007179838502211022, + "loss": 0.78802806, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.19421387, + "step": 1953, + "time_per_iteration": 2.8748068809509277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106763, + "balance_loss_mlp": 1.08678353, + "epoch": 0.37591381300500193, + "flos": 770962530816.0, + "grad_norm": 0.0737363931585354, + "language_loss": 0.86213845, + "learning_rate": 0.0007177034315870738, + "loss": 0.87320614, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.19970703, + "step": 1954, + "time_per_iteration": 2.961113929748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110959, + "balance_loss_mlp": 1.08933675, + "epoch": 0.37610619469026546, + "flos": 520448343552.0, + "grad_norm": 0.08944632819393537, + "language_loss": 0.91041321, + "learning_rate": 0.0007174229284330773, + "loss": 0.92150909, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.20239258, + "step": 1955, + "time_per_iteration": 2.6537580490112305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113343, + "balance_loss_mlp": 1.09273195, + "epoch": 0.37629857637552905, + "flos": 598812880896.0, + "grad_norm": 0.10287168416480917, + "language_loss": 0.86629105, + "learning_rate": 0.0007171423408680141, + "loss": 0.87742448, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.20605469, + "step": 
1956, + "time_per_iteration": 2.814793348312378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106345, + "balance_loss_mlp": 1.08584106, + "epoch": 0.37649095806079264, + "flos": 564952817664.0, + "grad_norm": 0.10543893351617999, + "language_loss": 0.89721847, + "learning_rate": 0.0007168616690008176, + "loss": 0.90828192, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.20495605, + "step": 1957, + "time_per_iteration": 2.6851284503936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098402, + "balance_loss_mlp": 1.07823181, + "epoch": 0.37668333974605617, + "flos": 592470360576.0, + "grad_norm": 0.08262297472790796, + "language_loss": 0.85860795, + "learning_rate": 0.0007165809129404545, + "loss": 0.86959195, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.20166016, + "step": 1958, + "time_per_iteration": 2.756485939025879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106731, + "balance_loss_mlp": 1.08695424, + "epoch": 0.37687572143131975, + "flos": 419478248448.0, + "grad_norm": 0.08262792958454514, + "language_loss": 0.85935986, + "learning_rate": 0.0007163000727959239, + "loss": 0.87042725, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.19775391, + "step": 1959, + "time_per_iteration": 2.525435447692871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070977, + "balance_loss_mlp": 1.06053388, + "epoch": 0.3770681031165833, + "flos": 1357262148096.0, + "grad_norm": 0.03547956764144784, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.79030049, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.10449219, + "step": 1960, + "time_per_iteration": 4.89080286026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149436, + "balance_loss_mlp": 1.13035011, + "epoch": 0.3772604848018469, + "flos": 644903534592.0, + "grad_norm": 0.06578569091259368, + "language_loss": 0.84412438, + 
"learning_rate": 0.00071573814069052, + "loss": 0.85561872, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.19067383, + "step": 1961, + "time_per_iteration": 2.9070186614990234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173736, + "balance_loss_mlp": 1.15444791, + "epoch": 0.3774528664871104, + "flos": 901651585536.0, + "grad_norm": 0.18582927476215966, + "language_loss": 0.87659955, + "learning_rate": 0.0007154570489478081, + "loss": 0.8883369, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.19274902, + "step": 1962, + "time_per_iteration": 3.2049379348754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173644, + "balance_loss_mlp": 1.15447557, + "epoch": 0.377645248172374, + "flos": 788065459200.0, + "grad_norm": 0.14724331795419812, + "language_loss": 0.86293024, + "learning_rate": 0.0007151758735572514, + "loss": 0.87466669, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.19152832, + "step": 1963, + "time_per_iteration": 3.0349316596984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142067, + "balance_loss_mlp": 1.12338686, + "epoch": 0.3778376298576376, + "flos": 586718111232.0, + "grad_norm": 0.0939989250476118, + "language_loss": 0.80074733, + "learning_rate": 0.0007148946146280119, + "loss": 0.812168, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.18676758, + "step": 1964, + "time_per_iteration": 2.8144431114196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048428, + "balance_loss_mlp": 1.03836632, + "epoch": 0.3780300115429011, + "flos": 1396743782400.0, + "grad_norm": 0.021748901232604565, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73240578, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.10058594, + "step": 1965, + "time_per_iteration": 4.930070400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055709, + "balance_loss_mlp": 
1.04559994, + "epoch": 0.3782223932281647, + "flos": 1357672555008.0, + "grad_norm": 0.023739163757957975, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76397657, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.10107422, + "step": 1966, + "time_per_iteration": 4.934873580932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137343, + "balance_loss_mlp": 1.11776876, + "epoch": 0.37841477491342823, + "flos": 704151304704.0, + "grad_norm": 0.08213272343580422, + "language_loss": 0.83802509, + "learning_rate": 0.0007140503377003022, + "loss": 0.84939849, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.19555664, + "step": 1967, + "time_per_iteration": 3.0881879329681396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139491, + "balance_loss_mlp": 1.11967874, + "epoch": 0.3786071565986918, + "flos": 529115991552.0, + "grad_norm": 0.1174729362064234, + "language_loss": 0.84845448, + "learning_rate": 0.000713768745708599, + "loss": 0.85984945, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.19799805, + "step": 1968, + "time_per_iteration": 2.635103225708008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150253, + "balance_loss_mlp": 1.12999952, + "epoch": 0.37879953828395535, + "flos": 993277126656.0, + "grad_norm": 0.12024050748438767, + "language_loss": 0.77237123, + "learning_rate": 0.0007134870707245085, + "loss": 0.7838738, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.20251465, + "step": 1969, + "time_per_iteration": 3.2765696048736572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137246, + "balance_loss_mlp": 1.11786246, + "epoch": 0.37899191996921894, + "flos": 626644283904.0, + "grad_norm": 0.12719814054785675, + "language_loss": 0.84604537, + "learning_rate": 0.0007132053128573864, + "loss": 0.85741782, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.19372559, + "step": 
1970, + "time_per_iteration": 2.741464614868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134845, + "balance_loss_mlp": 1.11534226, + "epoch": 0.37918430165448247, + "flos": 686307230208.0, + "grad_norm": 0.07594331821705162, + "language_loss": 0.83660662, + "learning_rate": 0.0007129234722166211, + "loss": 0.84795505, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.19482422, + "step": 1971, + "time_per_iteration": 2.879617214202881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150531, + "balance_loss_mlp": 1.13185048, + "epoch": 0.37937668333974606, + "flos": 475622668800.0, + "grad_norm": 0.10702357186833415, + "language_loss": 0.90689349, + "learning_rate": 0.0007126415489116328, + "loss": 0.91839886, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.18676758, + "step": 1972, + "time_per_iteration": 2.7060065269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177798, + "balance_loss_mlp": 1.15965438, + "epoch": 0.37956906502500964, + "flos": 707580997632.0, + "grad_norm": 0.08068810601979462, + "language_loss": 0.81252205, + "learning_rate": 0.0007123595430518736, + "loss": 0.82429999, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.18151855, + "step": 1973, + "time_per_iteration": 2.872903823852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217278, + "balance_loss_mlp": 1.19866943, + "epoch": 0.3797614467102732, + "flos": 426648549888.0, + "grad_norm": 0.10171747912447733, + "language_loss": 0.86328602, + "learning_rate": 0.0007120774547468282, + "loss": 0.87545884, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.18591309, + "step": 1974, + "time_per_iteration": 2.5397889614105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240679, + "balance_loss_mlp": 1.22244012, + "epoch": 0.37995382839553676, + "flos": 481846620672.0, + "grad_norm": 0.14549097169765346, + "language_loss": 0.81380564, 
+ "learning_rate": 0.0007117952841060128, + "loss": 0.82621247, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.18249512, + "step": 1975, + "time_per_iteration": 2.6751859188079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203512, + "balance_loss_mlp": 1.18491578, + "epoch": 0.3801462100808003, + "flos": 560562094080.0, + "grad_norm": 0.08096849874764685, + "language_loss": 0.8358916, + "learning_rate": 0.0007115130312389756, + "loss": 0.84792668, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.18579102, + "step": 1976, + "time_per_iteration": 2.6997742652893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194849, + "balance_loss_mlp": 1.17584705, + "epoch": 0.3803385917660639, + "flos": 464936412672.0, + "grad_norm": 0.0836403104795401, + "language_loss": 0.78931224, + "learning_rate": 0.0007112306962552973, + "loss": 0.80126077, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.18994141, + "step": 1977, + "time_per_iteration": 2.6066653728485107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177391, + "balance_loss_mlp": 1.15869951, + "epoch": 0.3805309734513274, + "flos": 521871538176.0, + "grad_norm": 0.0835848576107689, + "language_loss": 0.84830624, + "learning_rate": 0.0007109482792645896, + "loss": 0.86008012, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.18676758, + "step": 1978, + "time_per_iteration": 2.7217793464660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163855, + "balance_loss_mlp": 1.14444792, + "epoch": 0.380723355136591, + "flos": 591412783104.0, + "grad_norm": 0.18446881037378643, + "language_loss": 0.83627468, + "learning_rate": 0.0007106657803764969, + "loss": 0.84791327, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.19384766, + "step": 1979, + "time_per_iteration": 2.7421200275421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142527, + 
"balance_loss_mlp": 1.12388265, + "epoch": 0.38091573682185453, + "flos": 622685988864.0, + "grad_norm": 0.07567906441681438, + "language_loss": 0.81599772, + "learning_rate": 0.0007103831997006948, + "loss": 0.82742298, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.18652344, + "step": 1980, + "time_per_iteration": 2.7659311294555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137326, + "balance_loss_mlp": 1.11770415, + "epoch": 0.3811081185071181, + "flos": 569007286272.0, + "grad_norm": 0.10880870313335556, + "language_loss": 0.85352248, + "learning_rate": 0.0007101005373468908, + "loss": 0.86489582, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.19628906, + "step": 1981, + "time_per_iteration": 2.8786306381225586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130638, + "balance_loss_mlp": 1.11189866, + "epoch": 0.3813005001923817, + "flos": 584837895168.0, + "grad_norm": 0.09193767407328653, + "language_loss": 0.86793411, + "learning_rate": 0.0007098177934248242, + "loss": 0.87924051, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.18737793, + "step": 1982, + "time_per_iteration": 2.7491414546966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112619, + "balance_loss_mlp": 1.10644913, + "epoch": 0.38149288187764524, + "flos": 621591335424.0, + "grad_norm": 0.08063581171786138, + "language_loss": 0.85497284, + "learning_rate": 0.0007095349680442661, + "loss": 0.86623472, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.1973877, + "step": 1983, + "time_per_iteration": 2.8513927459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123414, + "balance_loss_mlp": 1.10408998, + "epoch": 0.3816852635629088, + "flos": 570690012672.0, + "grad_norm": 0.1315455004610476, + "language_loss": 0.79132575, + "learning_rate": 0.0007092520613150188, + "loss": 0.80255985, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 
0.19299316, + "step": 1984, + "time_per_iteration": 2.7137770652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122797, + "balance_loss_mlp": 1.1034615, + "epoch": 0.38187764524817236, + "flos": 565585307136.0, + "grad_norm": 0.07682315674204161, + "language_loss": 0.81457669, + "learning_rate": 0.0007089690733469165, + "loss": 0.82580465, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.1932373, + "step": 1985, + "time_per_iteration": 2.7019522190093994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153334, + "balance_loss_mlp": 1.13452315, + "epoch": 0.38207002693343595, + "flos": 631225156608.0, + "grad_norm": 0.10399563311309594, + "language_loss": 0.82318014, + "learning_rate": 0.000708686004249825, + "loss": 0.83471346, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.18811035, + "step": 1986, + "time_per_iteration": 2.797624111175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115288, + "balance_loss_mlp": 1.13355637, + "epoch": 0.3822624086186995, + "flos": 548773843968.0, + "grad_norm": 0.07772659738204864, + "language_loss": 0.91482198, + "learning_rate": 0.0007084028541336413, + "loss": 0.92635083, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.19299316, + "step": 1987, + "time_per_iteration": 2.7236177921295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159354, + "balance_loss_mlp": 1.13969636, + "epoch": 0.38245479030396307, + "flos": 613870036992.0, + "grad_norm": 0.13308271196687566, + "language_loss": 0.86052763, + "learning_rate": 0.0007081196231082942, + "loss": 0.87212121, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.19641113, + "step": 1988, + "time_per_iteration": 2.837611198425293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141061, + "balance_loss_mlp": 1.12171304, + "epoch": 0.38264717198922665, + "flos": 668089824768.0, + "grad_norm": 0.1253750556073725, + 
"language_loss": 0.79903424, + "learning_rate": 0.0007078363112837436, + "loss": 0.81044483, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.19335938, + "step": 1989, + "time_per_iteration": 2.8450546264648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135085, + "balance_loss_mlp": 1.11594021, + "epoch": 0.3828395536744902, + "flos": 454754165760.0, + "grad_norm": 0.06314586189395412, + "language_loss": 0.8480984, + "learning_rate": 0.000707552918769981, + "loss": 0.85944927, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.19128418, + "step": 1990, + "time_per_iteration": 2.5055301189422607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117815, + "balance_loss_mlp": 1.09837222, + "epoch": 0.3830319353597538, + "flos": 499448788992.0, + "grad_norm": 0.09018786790446763, + "language_loss": 0.8355186, + "learning_rate": 0.000707269445677029, + "loss": 0.84669679, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.19433594, + "step": 1991, + "time_per_iteration": 2.790247917175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120171, + "balance_loss_mlp": 1.10065699, + "epoch": 0.3832243170450173, + "flos": 744121893888.0, + "grad_norm": 0.07803627169317769, + "language_loss": 0.8551231, + "learning_rate": 0.0007069858921149416, + "loss": 0.86632484, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.19494629, + "step": 1992, + "time_per_iteration": 2.9850950241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128282, + "balance_loss_mlp": 1.10929155, + "epoch": 0.3834166987302809, + "flos": 578218590720.0, + "grad_norm": 0.08439673282063015, + "language_loss": 0.86369681, + "learning_rate": 0.0007067022581938043, + "loss": 0.87497962, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.18981934, + "step": 1993, + "time_per_iteration": 2.838817834854126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01120054, + "balance_loss_mlp": 1.10115981, + "epoch": 0.3836090804155444, + "flos": 536476442112.0, + "grad_norm": 0.10464401531680585, + "language_loss": 0.83076423, + "learning_rate": 0.0007064185440237334, + "loss": 0.84196478, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.18884277, + "step": 1994, + "time_per_iteration": 2.7403006553649902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113897, + "balance_loss_mlp": 1.09485924, + "epoch": 0.383801462100808, + "flos": 601879154688.0, + "grad_norm": 0.07520001194530918, + "language_loss": 0.8432954, + "learning_rate": 0.0007061347497148764, + "loss": 0.85443437, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.19018555, + "step": 1995, + "time_per_iteration": 2.797116994857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117711, + "balance_loss_mlp": 1.0988524, + "epoch": 0.38399384378607154, + "flos": 572701280256.0, + "grad_norm": 0.10442861201560887, + "language_loss": 0.86312652, + "learning_rate": 0.0007058508753774122, + "loss": 0.87430364, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.18847656, + "step": 1996, + "time_per_iteration": 2.708909511566162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111759, + "balance_loss_mlp": 1.098791, + "epoch": 0.38418622547133513, + "flos": 536765709312.0, + "grad_norm": 0.07371207674818485, + "language_loss": 0.86599022, + "learning_rate": 0.0007055669211215505, + "loss": 0.87716615, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.18786621, + "step": 1997, + "time_per_iteration": 2.639425277709961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129297, + "balance_loss_mlp": 1.11073565, + "epoch": 0.3843786071565987, + "flos": 572940988416.0, + "grad_norm": 0.10349237512498541, + "language_loss": 0.77684987, + "learning_rate": 0.0007052828870575322, + "loss": 0.7881428, + "num_input_tokens_seen": 166028064, + 
"router_z_loss_mlp": 0.1854248, + "step": 1998, + "time_per_iteration": 2.6582653522491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141105, + "balance_loss_mlp": 1.12290192, + "epoch": 0.38457098884186225, + "flos": 728703889920.0, + "grad_norm": 0.06112561257491971, + "language_loss": 0.8669157, + "learning_rate": 0.0007049987732956291, + "loss": 0.87832677, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.18212891, + "step": 1999, + "time_per_iteration": 2.9868295192718506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130964, + "balance_loss_mlp": 1.11211705, + "epoch": 0.38476337052712584, + "flos": 583422041088.0, + "grad_norm": 0.05929570453342199, + "language_loss": 0.82587528, + "learning_rate": 0.0007047145799461439, + "loss": 0.83718491, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.18835449, + "step": 2000, + "time_per_iteration": 2.8687593936920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136368, + "balance_loss_mlp": 1.11759257, + "epoch": 0.38495575221238937, + "flos": 553060680192.0, + "grad_norm": 0.08059531994541343, + "language_loss": 0.82050723, + "learning_rate": 0.00070443030711941, + "loss": 0.83187091, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.18762207, + "step": 2001, + "time_per_iteration": 2.7824347019195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113557, + "balance_loss_mlp": 1.11636579, + "epoch": 0.38514813389765296, + "flos": 654473115648.0, + "grad_norm": 0.09146293400396303, + "language_loss": 0.8213051, + "learning_rate": 0.0007041459549257924, + "loss": 0.83266079, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.19189453, + "step": 2002, + "time_per_iteration": 2.8634302616119385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137137, + "balance_loss_mlp": 1.11758697, + "epoch": 0.3853405155829165, + "flos": 868100239872.0, + "grad_norm": 
0.08512403296601297, + "language_loss": 0.78107333, + "learning_rate": 0.0007038615234756859, + "loss": 0.79244471, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.1953125, + "step": 2003, + "time_per_iteration": 3.2058236598968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136926, + "balance_loss_mlp": 1.11745918, + "epoch": 0.3855328972681801, + "flos": 546424123392.0, + "grad_norm": 0.07973278859066837, + "language_loss": 0.840294, + "learning_rate": 0.000703577012879517, + "loss": 0.85166335, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.19458008, + "step": 2004, + "time_per_iteration": 2.7286102771759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144109, + "balance_loss_mlp": 1.12510681, + "epoch": 0.3857252789534436, + "flos": 534074964480.0, + "grad_norm": 0.07975228006523119, + "language_loss": 0.88714588, + "learning_rate": 0.0007032924232477423, + "loss": 0.89858699, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.19006348, + "step": 2005, + "time_per_iteration": 2.6980981826782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136738, + "balance_loss_mlp": 1.11721206, + "epoch": 0.3859176606387072, + "flos": 491764566528.0, + "grad_norm": 0.08525396844891328, + "language_loss": 0.8036226, + "learning_rate": 0.0007030077546908493, + "loss": 0.81499004, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.19506836, + "step": 2006, + "time_per_iteration": 2.6433420181274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01225281, + "balance_loss_mlp": 1.21288347, + "epoch": 0.3861100423239708, + "flos": 1487052214272.0, + "grad_norm": 0.07049383229006134, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84289944, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.12402344, + "step": 2007, + "time_per_iteration": 4.82226037979126 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0113221, + "balance_loss_mlp": 1.11288631, + "epoch": 0.3863024240092343, + "flos": 473732540928.0, + "grad_norm": 0.07446306607004384, + "language_loss": 0.78622216, + "learning_rate": 0.0007024381812438117, + "loss": 0.7975443, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.19299316, + "step": 2008, + "time_per_iteration": 2.52738618850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128683, + "balance_loss_mlp": 1.10928798, + "epoch": 0.3864948056944979, + "flos": 716601779712.0, + "grad_norm": 0.09860455371344472, + "language_loss": 0.82941681, + "learning_rate": 0.0007021532765747951, + "loss": 0.84070361, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.19396973, + "step": 2009, + "time_per_iteration": 3.007847309112549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135681, + "balance_loss_mlp": 1.115821, + "epoch": 0.38668718737976143, + "flos": 727631631360.0, + "grad_norm": 0.08526755269117656, + "language_loss": 0.79078948, + "learning_rate": 0.0007018682934229162, + "loss": 0.80214632, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.1986084, + "step": 2010, + "time_per_iteration": 2.9435882568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122545, + "balance_loss_mlp": 1.10262537, + "epoch": 0.386879569065025, + "flos": 525471556608.0, + "grad_norm": 0.06758132101189684, + "language_loss": 0.82111001, + "learning_rate": 0.0007015832318988152, + "loss": 0.83233541, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.19909668, + "step": 2011, + "time_per_iteration": 2.6552624702453613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043733, + "balance_loss_mlp": 1.03133512, + "epoch": 0.38707195075028855, + "flos": 1527771663360.0, + "grad_norm": 0.01882295684379882, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74933803, + "num_input_tokens_seen": 
167391536, + "router_z_loss_mlp": 0.12402344, + "step": 2012, + "time_per_iteration": 5.011860609054565 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111441, + "balance_loss_mlp": 1.09159219, + "epoch": 0.38726433243555214, + "flos": 557313011712.0, + "grad_norm": 0.07301389252885741, + "language_loss": 0.84162498, + "learning_rate": 0.0007010128741766604, + "loss": 0.85273933, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.19836426, + "step": 2013, + "time_per_iteration": 2.766516923904419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111771, + "balance_loss_mlp": 1.09080195, + "epoch": 0.38745671412081567, + "flos": 553695740928.0, + "grad_norm": 0.10834212581561939, + "language_loss": 0.84428859, + "learning_rate": 0.0007007275782000391, + "loss": 0.85540634, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.20983887, + "step": 2014, + "time_per_iteration": 2.6184933185577393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108065, + "balance_loss_mlp": 1.08796668, + "epoch": 0.38764909580607926, + "flos": 458408512512.0, + "grad_norm": 0.07735715793711462, + "language_loss": 0.8448838, + "learning_rate": 0.0007004422042940605, + "loss": 0.85596442, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.20092773, + "step": 2015, + "time_per_iteration": 2.5543320178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109418, + "balance_loss_mlp": 1.08941483, + "epoch": 0.38784147749134285, + "flos": 522229814784.0, + "grad_norm": 0.08270873816767256, + "language_loss": 0.89443475, + "learning_rate": 0.0007001567525695169, + "loss": 0.9055289, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.19995117, + "step": 2016, + "time_per_iteration": 2.6072936058044434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106093, + "balance_loss_mlp": 1.08593512, + "epoch": 0.3880338591766064, + "flos": 666036338688.0, + 
"grad_norm": 0.06162053071135558, + "language_loss": 0.83763885, + "learning_rate": 0.0006998712231372303, + "loss": 0.84869981, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.20166016, + "step": 2017, + "time_per_iteration": 3.0785679817199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110727, + "balance_loss_mlp": 1.08730268, + "epoch": 0.38822624086186996, + "flos": 593962564608.0, + "grad_norm": 0.06865572989075389, + "language_loss": 0.86015558, + "learning_rate": 0.0006995856161080532, + "loss": 0.87122822, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.19958496, + "step": 2018, + "time_per_iteration": 2.8914577960968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112506, + "balance_loss_mlp": 1.09202576, + "epoch": 0.3884186225471335, + "flos": 612540817920.0, + "grad_norm": 0.07931380391873609, + "language_loss": 0.82694459, + "learning_rate": 0.0006992999315928679, + "loss": 0.83806968, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.20483398, + "step": 2019, + "time_per_iteration": 2.7892749309539795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110031, + "balance_loss_mlp": 1.08994412, + "epoch": 0.3886110042323971, + "flos": 607038188544.0, + "grad_norm": 0.08754557392654386, + "language_loss": 0.85419971, + "learning_rate": 0.0006990141697025871, + "loss": 0.8653, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.20080566, + "step": 2020, + "time_per_iteration": 2.7910003662109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038615, + "balance_loss_mlp": 1.02712286, + "epoch": 0.3888033859176606, + "flos": 1528067897856.0, + "grad_norm": 0.02439767662091094, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77398252, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.11474609, + "step": 2021, + "time_per_iteration": 4.809415340423584 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125614, + "balance_loss_mlp": 1.10596848, + "epoch": 0.3889957676029242, + "flos": 692449689600.0, + "grad_norm": 0.0885537285439357, + "language_loss": 0.82239556, + "learning_rate": 0.0006984424142405392, + "loss": 0.83365172, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.19641113, + "step": 2022, + "time_per_iteration": 2.8510379791259766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124515, + "balance_loss_mlp": 1.10540605, + "epoch": 0.3891881492881878, + "flos": 515187993600.0, + "grad_norm": 0.08944143564846467, + "language_loss": 0.82328045, + "learning_rate": 0.0006981564208907474, + "loss": 0.83452559, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.19091797, + "step": 2023, + "time_per_iteration": 2.6450161933898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125021, + "balance_loss_mlp": 1.10580468, + "epoch": 0.3893805309734513, + "flos": 629050904064.0, + "grad_norm": 0.06744861114448035, + "language_loss": 0.89889395, + "learning_rate": 0.0006978703506098102, + "loss": 0.91014421, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.19189453, + "step": 2024, + "time_per_iteration": 2.845273494720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142716, + "balance_loss_mlp": 1.12338066, + "epoch": 0.3895729126587149, + "flos": 544155895296.0, + "grad_norm": 0.22805579315722818, + "language_loss": 0.87903351, + "learning_rate": 0.00069758420350879, + "loss": 0.89046067, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.1932373, + "step": 2025, + "time_per_iteration": 2.673590898513794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147111, + "balance_loss_mlp": 1.12706041, + "epoch": 0.38976529434397844, + "flos": 618270672384.0, + "grad_norm": 0.08766781252639666, + "language_loss": 0.85837841, + "learning_rate": 0.000697297979698779, + "loss": 0.86984944, + 
"num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.20043945, + "step": 2026, + "time_per_iteration": 2.7639670372009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146919, + "balance_loss_mlp": 1.12766671, + "epoch": 0.38995767602924203, + "flos": 834882577920.0, + "grad_norm": 0.06921765861152807, + "language_loss": 0.83379734, + "learning_rate": 0.0006970116792908992, + "loss": 0.84526652, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.19226074, + "step": 2027, + "time_per_iteration": 3.1537575721740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165828, + "balance_loss_mlp": 1.14574075, + "epoch": 0.39015005771450556, + "flos": 541603542528.0, + "grad_norm": 0.10608539967442848, + "language_loss": 0.81162727, + "learning_rate": 0.000696725302396302, + "loss": 0.82328546, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.20080566, + "step": 2028, + "time_per_iteration": 2.713486671447754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169814, + "balance_loss_mlp": 1.14985871, + "epoch": 0.39034243939976915, + "flos": 1007509072896.0, + "grad_norm": 0.08953149679914804, + "language_loss": 0.85771465, + "learning_rate": 0.0006964388491261692, + "loss": 0.86941278, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.19946289, + "step": 2029, + "time_per_iteration": 3.2685461044311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117313, + "balance_loss_mlp": 1.15280437, + "epoch": 0.3905348210850327, + "flos": 679025700864.0, + "grad_norm": 0.07138064393758646, + "language_loss": 0.87465048, + "learning_rate": 0.0006961523195917114, + "loss": 0.88638175, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.20324707, + "step": 2030, + "time_per_iteration": 2.8363735675811768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173533, + "balance_loss_mlp": 1.15370905, + "epoch": 0.39072720277029627, + "flos": 
548882500608.0, + "grad_norm": 0.07919234366723153, + "language_loss": 0.78095168, + "learning_rate": 0.0006958657139041696, + "loss": 0.792687, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.19812012, + "step": 2031, + "time_per_iteration": 2.7535581588745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093882, + "balance_loss_mlp": 1.0820564, + "epoch": 0.39091958445555985, + "flos": 1547737860096.0, + "grad_norm": 0.028372833662772774, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77806854, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.11816406, + "step": 2032, + "time_per_iteration": 4.918071508407593 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01162947, + "balance_loss_mlp": 1.14219236, + "epoch": 0.3911119661408234, + "flos": 504002497536.0, + "grad_norm": 0.08595509799025135, + "language_loss": 0.78080893, + "learning_rate": 0.0006952922745149434, + "loss": 0.79243839, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.2076416, + "step": 2033, + "time_per_iteration": 2.6598660945892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160858, + "balance_loss_mlp": 1.14035416, + "epoch": 0.391304347826087, + "flos": 557238859776.0, + "grad_norm": 0.06804618944659446, + "language_loss": 0.87450963, + "learning_rate": 0.000695005441035888, + "loss": 0.88611823, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.20507812, + "step": 2034, + "time_per_iteration": 2.6846048831939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073577, + "balance_loss_mlp": 1.06218028, + "epoch": 0.3914967295113505, + "flos": 1500034235904.0, + "grad_norm": 0.025244772676945967, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.7479701, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.11376953, + "step": 2035, + "time_per_iteration": 4.866973638534546 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147653, + "balance_loss_mlp": 1.12698257, + "epoch": 0.3916891111966141, + "flos": 707037341184.0, + "grad_norm": 0.06481204645981475, + "language_loss": 0.80968261, + "learning_rate": 0.0006944315470656863, + "loss": 0.82115912, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.20678711, + "step": 2036, + "time_per_iteration": 2.973759412765503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139407, + "balance_loss_mlp": 1.11935592, + "epoch": 0.3918814928818776, + "flos": 556349409792.0, + "grad_norm": 0.08143475646221604, + "language_loss": 0.90850043, + "learning_rate": 0.000694144486797345, + "loss": 0.91989452, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.20043945, + "step": 2037, + "time_per_iteration": 2.736645221710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042479, + "balance_loss_mlp": 1.03184605, + "epoch": 0.3920738745671412, + "flos": 1538610992640.0, + "grad_norm": 0.02072601949350613, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80562913, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.10644531, + "step": 2038, + "time_per_iteration": 4.651543140411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130224, + "balance_loss_mlp": 1.11029196, + "epoch": 0.39226625625240474, + "flos": 498836123136.0, + "grad_norm": 0.08780788201299033, + "language_loss": 0.89056122, + "learning_rate": 0.0006935701402514156, + "loss": 0.90186346, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.19921875, + "step": 2039, + "time_per_iteration": 2.610884666442871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025416, + "balance_loss_mlp": 1.01525903, + "epoch": 0.39245863793766833, + "flos": 1347260138496.0, + "grad_norm": 0.013600241372588764, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 
0.74060309, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.1015625, + "step": 2040, + "time_per_iteration": 4.982971906661987 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139694, + "balance_loss_mlp": 1.12033463, + "epoch": 0.3926510196229319, + "flos": 1346113022976.0, + "grad_norm": 0.07758123210342138, + "language_loss": 0.84211379, + "learning_rate": 0.0006929954931031422, + "loss": 0.85351074, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.19348145, + "step": 2041, + "time_per_iteration": 3.722700595855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143114, + "balance_loss_mlp": 1.12322998, + "epoch": 0.39284340130819545, + "flos": 499587181056.0, + "grad_norm": 0.05684242147097161, + "language_loss": 0.88287592, + "learning_rate": 0.0006927080570819805, + "loss": 0.89430702, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.19885254, + "step": 2042, + "time_per_iteration": 2.6228466033935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146557, + "balance_loss_mlp": 1.12712598, + "epoch": 0.39303578299345904, + "flos": 520329775104.0, + "grad_norm": 0.09880041485830528, + "language_loss": 0.80978543, + "learning_rate": 0.0006924205462449161, + "loss": 0.82125103, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.19421387, + "step": 2043, + "time_per_iteration": 2.5959606170654297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130878, + "balance_loss_mlp": 1.11204302, + "epoch": 0.39322816467872257, + "flos": 907929865728.0, + "grad_norm": 0.07421884933278829, + "language_loss": 0.81996524, + "learning_rate": 0.0006921329607035702, + "loss": 0.83127403, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.18823242, + "step": 2044, + "time_per_iteration": 3.2492971420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112622, + "balance_loss_mlp": 1.10749173, + "epoch": 
0.39342054636398616, + "flos": 517592042496.0, + "grad_norm": 0.0837559423677037, + "language_loss": 0.87882477, + "learning_rate": 0.0006918453005695938, + "loss": 0.89008695, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.18701172, + "step": 2045, + "time_per_iteration": 2.649426221847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120666, + "balance_loss_mlp": 1.10098422, + "epoch": 0.3936129280492497, + "flos": 547918898688.0, + "grad_norm": 0.0619155211719984, + "language_loss": 0.84122574, + "learning_rate": 0.0006915575659546662, + "loss": 0.85243243, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.19665527, + "step": 2046, + "time_per_iteration": 2.7105627059936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109896, + "balance_loss_mlp": 1.09044123, + "epoch": 0.3938053097345133, + "flos": 526113957888.0, + "grad_norm": 0.0891593284161872, + "language_loss": 0.80576289, + "learning_rate": 0.0006912697569704959, + "loss": 0.81686187, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.19445801, + "step": 2047, + "time_per_iteration": 2.700460910797119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117623, + "balance_loss_mlp": 1.09800088, + "epoch": 0.39399769141977686, + "flos": 471629869056.0, + "grad_norm": 0.09048948583390962, + "language_loss": 0.86559486, + "learning_rate": 0.0006909818737288205, + "loss": 0.87677109, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.19604492, + "step": 2048, + "time_per_iteration": 2.593365430831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122837, + "balance_loss_mlp": 1.10311985, + "epoch": 0.3941900731050404, + "flos": 501736840704.0, + "grad_norm": 0.0812760632256331, + "language_loss": 0.8078903, + "learning_rate": 0.000690693916341406, + "loss": 0.81911868, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.19702148, + "step": 2049, + "time_per_iteration": 
2.6433444023132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114252, + "balance_loss_mlp": 1.09472609, + "epoch": 0.394382454790304, + "flos": 580862347776.0, + "grad_norm": 0.0788936263124851, + "language_loss": 0.82210761, + "learning_rate": 0.0006904058849200475, + "loss": 0.83325016, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.19506836, + "step": 2050, + "time_per_iteration": 2.7488439083099365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114662, + "balance_loss_mlp": 1.09468246, + "epoch": 0.3945748364755675, + "flos": 513819127296.0, + "grad_norm": 0.10945632429468012, + "language_loss": 0.8477484, + "learning_rate": 0.0006901177795765683, + "loss": 0.858895, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.19970703, + "step": 2051, + "time_per_iteration": 2.6071059703826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101751, + "balance_loss_mlp": 1.08223617, + "epoch": 0.3947672181608311, + "flos": 593957795328.0, + "grad_norm": 0.07628310806963638, + "language_loss": 0.81390727, + "learning_rate": 0.0006898296004228213, + "loss": 0.82492483, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.19494629, + "step": 2052, + "time_per_iteration": 2.725609540939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01195172, + "balance_loss_mlp": 1.18334627, + "epoch": 0.39495959984609463, + "flos": 1547671048704.0, + "grad_norm": 0.06244005501870815, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79321915, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.11816406, + "step": 2053, + "time_per_iteration": 4.871281862258911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111122, + "balance_loss_mlp": 1.09123778, + "epoch": 0.3951519815313582, + "flos": 496520907264.0, + "grad_norm": 0.08281763462186637, + "language_loss": 0.79986715, + "learning_rate": 
0.0006892530211320763, + "loss": 0.81097841, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.19873047, + "step": 2054, + "time_per_iteration": 2.7042620182037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125901, + "balance_loss_mlp": 1.10589778, + "epoch": 0.39534436321662175, + "flos": 531191499264.0, + "grad_norm": 0.08642547559894523, + "language_loss": 0.83690774, + "learning_rate": 0.000688964621218926, + "loss": 0.8481667, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.19995117, + "step": 2055, + "time_per_iteration": 2.6359920501708984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120805, + "balance_loss_mlp": 1.10112405, + "epoch": 0.39553674490188534, + "flos": 702523279872.0, + "grad_norm": 0.10380118482872411, + "language_loss": 0.79915357, + "learning_rate": 0.0006886761479432037, + "loss": 0.81036162, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.19665527, + "step": 2056, + "time_per_iteration": 2.872950792312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122886, + "balance_loss_mlp": 1.10250163, + "epoch": 0.3957291265871489, + "flos": 409772846592.0, + "grad_norm": 0.07844536568455973, + "language_loss": 0.8461678, + "learning_rate": 0.0006883876014169045, + "loss": 0.8573966, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.20385742, + "step": 2057, + "time_per_iteration": 2.5555264949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132813, + "balance_loss_mlp": 1.11285698, + "epoch": 0.39592150827241246, + "flos": 618490556928.0, + "grad_norm": 0.08268955880836791, + "language_loss": 0.90132928, + "learning_rate": 0.000688098981752052, + "loss": 0.91265738, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.19946289, + "step": 2058, + "time_per_iteration": 2.7518441677093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134779, + "balance_loss_mlp": 1.11504984, 
+ "epoch": 0.39611388995767605, + "flos": 821332680192.0, + "grad_norm": 0.09934928750763956, + "language_loss": 0.80161107, + "learning_rate": 0.0006878102890606982, + "loss": 0.81295884, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.19726562, + "step": 2059, + "time_per_iteration": 3.098393678665161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122363, + "balance_loss_mlp": 1.10231209, + "epoch": 0.3963062716429396, + "flos": 492224159232.0, + "grad_norm": 0.08965795352869743, + "language_loss": 0.80914015, + "learning_rate": 0.0006875215234549239, + "loss": 0.82036376, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.20043945, + "step": 2060, + "time_per_iteration": 2.591871976852417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112284, + "balance_loss_mlp": 1.10284913, + "epoch": 0.39649865332820317, + "flos": 584739150336.0, + "grad_norm": 0.08963098282996143, + "language_loss": 0.85349464, + "learning_rate": 0.0006872326850468376, + "loss": 0.86472309, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.19995117, + "step": 2061, + "time_per_iteration": 2.7322757244110107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121862, + "balance_loss_mlp": 1.10210919, + "epoch": 0.3966910350134667, + "flos": 458564156928.0, + "grad_norm": 0.08450203568488315, + "language_loss": 0.78602254, + "learning_rate": 0.0006869437739485762, + "loss": 0.79724109, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.19750977, + "step": 2062, + "time_per_iteration": 2.679453134536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_mlp": 1.09274697, + "epoch": 0.3968834166987303, + "flos": 508632929280.0, + "grad_norm": 0.07578248331540363, + "language_loss": 0.92750496, + "learning_rate": 0.0006866547902723053, + "loss": 0.93862933, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.19677734, + "step": 2063, + 
"time_per_iteration": 2.680661201477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100055, + "balance_loss_mlp": 1.08058822, + "epoch": 0.3970757983839938, + "flos": 572627128320.0, + "grad_norm": 0.07543651474129125, + "language_loss": 0.80317062, + "learning_rate": 0.000686365734130218, + "loss": 0.8141712, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.19458008, + "step": 2064, + "time_per_iteration": 2.695892095565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106834, + "balance_loss_mlp": 1.08669949, + "epoch": 0.3972681800692574, + "flos": 481629307392.0, + "grad_norm": 0.08078876442086359, + "language_loss": 0.84065503, + "learning_rate": 0.000686076605634536, + "loss": 0.85172331, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.20129395, + "step": 2065, + "time_per_iteration": 2.642617702484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113253, + "balance_loss_mlp": 1.0935117, + "epoch": 0.397460561754521, + "flos": 487927411200.0, + "grad_norm": 0.08876156008903276, + "language_loss": 0.84441757, + "learning_rate": 0.0006857874048975088, + "loss": 0.85555011, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.19726562, + "step": 2066, + "time_per_iteration": 2.6363344192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102381, + "balance_loss_mlp": 1.08237755, + "epoch": 0.3976529434397845, + "flos": 421993525248.0, + "grad_norm": 0.06515627567230846, + "language_loss": 0.87180257, + "learning_rate": 0.0006854981320314142, + "loss": 0.88282633, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.19995117, + "step": 2067, + "time_per_iteration": 2.510763645172119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105961, + "balance_loss_mlp": 1.08644629, + "epoch": 0.3978453251250481, + "flos": 545589001728.0, + "grad_norm": 0.08362186096435482, + "language_loss": 0.86780995, + "learning_rate": 
0.0006852087871485579, + "loss": 0.87886953, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.19506836, + "step": 2068, + "time_per_iteration": 2.653662919998169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106158, + "balance_loss_mlp": 1.08698964, + "epoch": 0.39803770681031164, + "flos": 650838592512.0, + "grad_norm": 0.09469661693362608, + "language_loss": 0.81769943, + "learning_rate": 0.0006849193703612735, + "loss": 0.82876104, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.19177246, + "step": 2069, + "time_per_iteration": 2.7798843383789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094976, + "balance_loss_mlp": 1.0750916, + "epoch": 0.39823008849557523, + "flos": 740072194560.0, + "grad_norm": 0.07513124412486355, + "language_loss": 0.77589542, + "learning_rate": 0.0006846298817819225, + "loss": 0.78684515, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.19873047, + "step": 2070, + "time_per_iteration": 2.984025716781616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094931, + "balance_loss_mlp": 1.07543969, + "epoch": 0.39842247018083876, + "flos": 385037452800.0, + "grad_norm": 0.07496601113124422, + "language_loss": 0.80744815, + "learning_rate": 0.0006843403215228945, + "loss": 0.8183974, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.19482422, + "step": 2071, + "time_per_iteration": 2.4528424739837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113518, + "balance_loss_mlp": 1.09400368, + "epoch": 0.39861485186610235, + "flos": 533696864256.0, + "grad_norm": 0.10952507549773222, + "language_loss": 0.80553752, + "learning_rate": 0.0006840506896966065, + "loss": 0.81667268, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.19519043, + "step": 2072, + "time_per_iteration": 2.7193689346313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113479, + "balance_loss_mlp": 
1.09405994, + "epoch": 0.39880723355136594, + "flos": 643149227520.0, + "grad_norm": 0.07287911350271854, + "language_loss": 0.81897116, + "learning_rate": 0.0006837609864155038, + "loss": 0.8301059, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.1940918, + "step": 2073, + "time_per_iteration": 2.9260082244873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110629, + "balance_loss_mlp": 1.09179354, + "epoch": 0.39899961523662947, + "flos": 515847647232.0, + "grad_norm": 0.0731734663182413, + "language_loss": 0.83157325, + "learning_rate": 0.0006834712117920592, + "loss": 0.8426795, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.18823242, + "step": 2074, + "time_per_iteration": 2.629744052886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117154, + "balance_loss_mlp": 1.09769917, + "epoch": 0.39919199692189306, + "flos": 464385415680.0, + "grad_norm": 0.07643256719558747, + "language_loss": 0.85673088, + "learning_rate": 0.0006831813659387729, + "loss": 0.8679024, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.19433594, + "step": 2075, + "time_per_iteration": 2.5350148677825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116666, + "balance_loss_mlp": 1.0971514, + "epoch": 0.3993843786071566, + "flos": 531641180160.0, + "grad_norm": 0.07671111115245405, + "language_loss": 0.84214932, + "learning_rate": 0.0006828914489681733, + "loss": 0.85331595, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.19494629, + "step": 2076, + "time_per_iteration": 2.724330425262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125458, + "balance_loss_mlp": 1.10627747, + "epoch": 0.3995767602924202, + "flos": 503965421568.0, + "grad_norm": 0.08210563860740908, + "language_loss": 0.85224628, + "learning_rate": 0.0006826014609928162, + "loss": 0.86350089, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.19165039, + "step": 2077, 
+ "time_per_iteration": 2.737734079360962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070244, + "balance_loss_mlp": 1.06070685, + "epoch": 0.3997691419776837, + "flos": 1454516600832.0, + "grad_norm": 0.03932449118700248, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.84269631, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.09521484, + "step": 2078, + "time_per_iteration": 4.887951612472534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124282, + "balance_loss_mlp": 1.10458827, + "epoch": 0.3999615236629473, + "flos": 530684918784.0, + "grad_norm": 0.09240147129054761, + "language_loss": 0.80077326, + "learning_rate": 0.0006820212724781896, + "loss": 0.81201607, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.19677734, + "step": 2079, + "time_per_iteration": 2.6855874061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114733, + "balance_loss_mlp": 1.09537315, + "epoch": 0.4001539053482108, + "flos": 695130522624.0, + "grad_norm": 0.0724055342629082, + "language_loss": 0.84239459, + "learning_rate": 0.0006817310721641694, + "loss": 0.85354191, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.19335938, + "step": 2080, + "time_per_iteration": 2.902536392211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122461, + "balance_loss_mlp": 1.10289896, + "epoch": 0.4003462870334744, + "flos": 520356939264.0, + "grad_norm": 0.0894692108770988, + "language_loss": 0.83972865, + "learning_rate": 0.00068144080129589, + "loss": 0.85095322, + "num_input_tokens_seen": 173430144, + "router_z_loss_mlp": 0.19543457, + "step": 2081, + "time_per_iteration": 2.613067865371704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122596, + "balance_loss_mlp": 1.1030333, + "epoch": 0.400538668718738, + "flos": 492518195712.0, + "grad_norm": 0.09472281695894083, + "language_loss": 0.82174724, + 
"learning_rate": 0.0006811504599860441, + "loss": 0.83297324, + "num_input_tokens_seen": 173494464, + "router_z_loss_mlp": 0.19555664, + "step": 2082, + "time_per_iteration": 2.6002771854400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111624, + "balance_loss_mlp": 1.09634447, + "epoch": 0.40073105040400153, + "flos": 490356052992.0, + "grad_norm": 0.06828551193852998, + "language_loss": 0.85353184, + "learning_rate": 0.0006808600483473526, + "loss": 0.86469424, + "num_input_tokens_seen": 173577168, + "router_z_loss_mlp": 0.19897461, + "step": 2083, + "time_per_iteration": 2.9010846614837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107422, + "balance_loss_mlp": 1.0885756, + "epoch": 0.4009234320892651, + "flos": 562378070016.0, + "grad_norm": 0.07802980838834611, + "language_loss": 0.8652671, + "learning_rate": 0.0006805695664925629, + "loss": 0.87634128, + "num_input_tokens_seen": 173655632, + "router_z_loss_mlp": 0.18823242, + "step": 2084, + "time_per_iteration": 2.8027803897857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111632, + "balance_loss_mlp": 1.0970912, + "epoch": 0.40111581377452865, + "flos": 425998808064.0, + "grad_norm": 0.08245020261724635, + "language_loss": 0.8423562, + "learning_rate": 0.0006802790145344506, + "loss": 0.85351944, + "num_input_tokens_seen": 173719040, + "router_z_loss_mlp": 0.19238281, + "step": 2085, + "time_per_iteration": 2.5397531986236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119142, + "balance_loss_mlp": 1.10039067, + "epoch": 0.40130819545979224, + "flos": 612446842368.0, + "grad_norm": 0.07508565386227965, + "language_loss": 0.87270218, + "learning_rate": 0.0006799883925858176, + "loss": 0.88389367, + "num_input_tokens_seen": 173796704, + "router_z_loss_mlp": 0.18737793, + "step": 2086, + "time_per_iteration": 2.876164197921753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112869, + 
"balance_loss_mlp": 1.10978329, + "epoch": 0.40150057714505577, + "flos": 523433124864.0, + "grad_norm": 0.07429159623595777, + "language_loss": 0.84809011, + "learning_rate": 0.0006796977007594933, + "loss": 0.85937703, + "num_input_tokens_seen": 173862352, + "router_z_loss_mlp": 0.18896484, + "step": 2087, + "time_per_iteration": 2.6302778720855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136639, + "balance_loss_mlp": 1.11681485, + "epoch": 0.40169295883031936, + "flos": 561424379904.0, + "grad_norm": 0.06510767025647884, + "language_loss": 0.86000383, + "learning_rate": 0.0006794069391683345, + "loss": 0.8713702, + "num_input_tokens_seen": 173935408, + "router_z_loss_mlp": 0.19824219, + "step": 2088, + "time_per_iteration": 2.7642226219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125568, + "balance_loss_mlp": 1.10582721, + "epoch": 0.4018853405155829, + "flos": 518997984768.0, + "grad_norm": 0.07763642733040174, + "language_loss": 0.80219448, + "learning_rate": 0.0006791161079252248, + "loss": 0.81345016, + "num_input_tokens_seen": 174007152, + "router_z_loss_mlp": 0.19726562, + "step": 2089, + "time_per_iteration": 2.6216719150543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112898, + "balance_loss_mlp": 1.10969198, + "epoch": 0.4020777222008465, + "flos": 526222614528.0, + "grad_norm": 0.06753993516242088, + "language_loss": 0.82396168, + "learning_rate": 0.0006788252071430747, + "loss": 0.83525145, + "num_input_tokens_seen": 174074976, + "router_z_loss_mlp": 0.19262695, + "step": 2090, + "time_per_iteration": 2.6881613731384277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136792, + "balance_loss_mlp": 1.11759949, + "epoch": 0.40227010388611006, + "flos": 525763021824.0, + "grad_norm": 0.07938192983074185, + "language_loss": 0.86496997, + "learning_rate": 0.0006785342369348222, + "loss": 0.87633789, + "num_input_tokens_seen": 174149392, + "router_z_loss_mlp": 
0.19177246, + "step": 2091, + "time_per_iteration": 2.7187774181365967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134169, + "balance_loss_mlp": 1.11497617, + "epoch": 0.4024624855713736, + "flos": 432304252416.0, + "grad_norm": 0.08007566317284716, + "language_loss": 0.79674286, + "learning_rate": 0.0006782431974134316, + "loss": 0.80808461, + "num_input_tokens_seen": 174214656, + "router_z_loss_mlp": 0.19189453, + "step": 2092, + "time_per_iteration": 2.5497889518737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112292, + "balance_loss_mlp": 1.10301197, + "epoch": 0.4026548672566372, + "flos": 766660640256.0, + "grad_norm": 0.09546920549904063, + "language_loss": 0.89602369, + "learning_rate": 0.0006779520886918949, + "loss": 0.90725285, + "num_input_tokens_seen": 174296064, + "router_z_loss_mlp": 0.19897461, + "step": 2093, + "time_per_iteration": 3.070051431655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126954, + "balance_loss_mlp": 1.10783303, + "epoch": 0.4028472489419007, + "flos": 642931914240.0, + "grad_norm": 0.07932487566864904, + "language_loss": 0.81140947, + "learning_rate": 0.0006776609108832301, + "loss": 0.82267904, + "num_input_tokens_seen": 174370896, + "router_z_loss_mlp": 0.19116211, + "step": 2094, + "time_per_iteration": 2.8635079860687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117981, + "balance_loss_mlp": 1.09895563, + "epoch": 0.4030396306271643, + "flos": 491838718464.0, + "grad_norm": 0.08200776323916202, + "language_loss": 0.85093951, + "learning_rate": 0.0006773696641004828, + "loss": 0.86211932, + "num_input_tokens_seen": 174438448, + "router_z_loss_mlp": 0.19006348, + "step": 2095, + "time_per_iteration": 2.569387435913086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119321, + "balance_loss_mlp": 1.09972358, + "epoch": 0.40323201231242783, + "flos": 901728308736.0, + "grad_norm": 0.09231967023328698, + 
"language_loss": 0.77639973, + "learning_rate": 0.0006770783484567247, + "loss": 0.78759301, + "num_input_tokens_seen": 174525952, + "router_z_loss_mlp": 0.19592285, + "step": 2096, + "time_per_iteration": 3.1237080097198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109679, + "balance_loss_mlp": 1.09033108, + "epoch": 0.4034243939976914, + "flos": 570558961152.0, + "grad_norm": 0.07679281592908915, + "language_loss": 0.86043823, + "learning_rate": 0.000676786964065055, + "loss": 0.871535, + "num_input_tokens_seen": 174607200, + "router_z_loss_mlp": 0.1932373, + "step": 2097, + "time_per_iteration": 2.785017728805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112121, + "balance_loss_mlp": 1.10181427, + "epoch": 0.403616775682955, + "flos": 507456783360.0, + "grad_norm": 0.07049509838223245, + "language_loss": 0.78567326, + "learning_rate": 0.0006764955110385986, + "loss": 0.79688537, + "num_input_tokens_seen": 174680976, + "router_z_loss_mlp": 0.19384766, + "step": 2098, + "time_per_iteration": 2.7599899768829346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011178, + "balance_loss_mlp": 1.09878576, + "epoch": 0.40380915736821854, + "flos": 519383425536.0, + "grad_norm": 0.07587511524565468, + "language_loss": 0.8025918, + "learning_rate": 0.0006762039894905083, + "loss": 0.81376982, + "num_input_tokens_seen": 174753152, + "router_z_loss_mlp": 0.19006348, + "step": 2099, + "time_per_iteration": 2.6616034507751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115132, + "balance_loss_mlp": 1.09635651, + "epoch": 0.40400153905348213, + "flos": 441925590528.0, + "grad_norm": 0.08446355623188201, + "language_loss": 0.80088019, + "learning_rate": 0.000675912399533962, + "loss": 0.81203151, + "num_input_tokens_seen": 174817184, + "router_z_loss_mlp": 0.1875, + "step": 2100, + "time_per_iteration": 2.53584885597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01129908, + "balance_loss_mlp": 1.1112045, + "epoch": 0.40419392073874566, + "flos": 772309002240.0, + "grad_norm": 0.057425192194628195, + "language_loss": 0.84893382, + "learning_rate": 0.0006756207412821656, + "loss": 0.86023289, + "num_input_tokens_seen": 174898128, + "router_z_loss_mlp": 0.18701172, + "step": 2101, + "time_per_iteration": 3.0146372318267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133046, + "balance_loss_mlp": 1.11444974, + "epoch": 0.40438630242400925, + "flos": 766569235968.0, + "grad_norm": 0.08385244443422216, + "language_loss": 0.79946959, + "learning_rate": 0.0006753290148483505, + "loss": 0.81080002, + "num_input_tokens_seen": 174981872, + "router_z_loss_mlp": 0.18603516, + "step": 2102, + "time_per_iteration": 3.1141843795776367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131753, + "balance_loss_mlp": 1.11306119, + "epoch": 0.4045786841092728, + "flos": 415235828736.0, + "grad_norm": 0.10321495678621663, + "language_loss": 0.7855078, + "learning_rate": 0.0006750372203457752, + "loss": 0.79682529, + "num_input_tokens_seen": 175044976, + "router_z_loss_mlp": 0.18688965, + "step": 2103, + "time_per_iteration": 2.5273704528808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133211, + "balance_loss_mlp": 1.1144712, + "epoch": 0.40477106579453637, + "flos": 539214174720.0, + "grad_norm": 0.06897182936898366, + "language_loss": 0.86569643, + "learning_rate": 0.0006747453578877242, + "loss": 0.87702858, + "num_input_tokens_seen": 175121104, + "router_z_loss_mlp": 0.18725586, + "step": 2104, + "time_per_iteration": 2.7731292247772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136147, + "balance_loss_mlp": 1.11752641, + "epoch": 0.4049634474797999, + "flos": 826704258048.0, + "grad_norm": 0.08357448735589112, + "language_loss": 0.82917869, + "learning_rate": 0.0006744534275875085, + "loss": 0.84054017, + "num_input_tokens_seen": 175194512, + 
"router_z_loss_mlp": 0.1862793, + "step": 2105, + "time_per_iteration": 3.0466742515563965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148022, + "balance_loss_mlp": 1.12974763, + "epoch": 0.4051558291650635, + "flos": 572684027904.0, + "grad_norm": 0.09276188373090515, + "language_loss": 0.85562009, + "learning_rate": 0.0006741614295584657, + "loss": 0.8671003, + "num_input_tokens_seen": 175264176, + "router_z_loss_mlp": 0.18273926, + "step": 2106, + "time_per_iteration": 2.678776264190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115565, + "balance_loss_mlp": 1.13704157, + "epoch": 0.4053482108503271, + "flos": 731881391616.0, + "grad_norm": 0.0813184956351506, + "language_loss": 0.78235412, + "learning_rate": 0.0006738693639139595, + "loss": 0.79391062, + "num_input_tokens_seen": 175347488, + "router_z_loss_mlp": 0.18603516, + "step": 2107, + "time_per_iteration": 3.0155587196350098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157609, + "balance_loss_mlp": 1.13920343, + "epoch": 0.4055405925355906, + "flos": 1213059193344.0, + "grad_norm": 0.09421684944263367, + "language_loss": 0.77232802, + "learning_rate": 0.0006735772307673796, + "loss": 0.78390408, + "num_input_tokens_seen": 175438336, + "router_z_loss_mlp": 0.18408203, + "step": 2108, + "time_per_iteration": 3.586928129196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01165053, + "balance_loss_mlp": 1.14651608, + "epoch": 0.4057329742208542, + "flos": 715863204864.0, + "grad_norm": 0.06861239024528153, + "language_loss": 0.83003211, + "learning_rate": 0.0006732850302321421, + "loss": 0.84168267, + "num_input_tokens_seen": 175510912, + "router_z_loss_mlp": 0.18518066, + "step": 2109, + "time_per_iteration": 2.9429726600646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160041, + "balance_loss_mlp": 1.14086008, + "epoch": 0.4059253559061177, + "flos": 564888577536.0, + "grad_norm": 
0.07515968908819307, + "language_loss": 0.84144229, + "learning_rate": 0.00067299276242169, + "loss": 0.85304272, + "num_input_tokens_seen": 175583040, + "router_z_loss_mlp": 0.19177246, + "step": 2110, + "time_per_iteration": 2.6710071563720703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044281, + "balance_loss_mlp": 1.03436232, + "epoch": 0.4061177375913813, + "flos": 1593744450048.0, + "grad_norm": 0.023257265358085616, + "language_loss": 0.74382168, + "learning_rate": 0.0006727004274494908, + "loss": 0.75426447, + "num_input_tokens_seen": 175817952, + "router_z_loss_mlp": 0.09912109, + "step": 2111, + "time_per_iteration": 4.914813756942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151064, + "balance_loss_mlp": 1.13221717, + "epoch": 0.40631011927664484, + "flos": 615421711872.0, + "grad_norm": 0.09830411974127871, + "language_loss": 0.77889705, + "learning_rate": 0.0006724080254290395, + "loss": 0.79040766, + "num_input_tokens_seen": 175896352, + "router_z_loss_mlp": 0.18811035, + "step": 2112, + "time_per_iteration": 2.8067259788513184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136038, + "balance_loss_mlp": 1.11665511, + "epoch": 0.40650250096190843, + "flos": 557661376512.0, + "grad_norm": 0.07964969066506762, + "language_loss": 0.89744002, + "learning_rate": 0.0006721155564738566, + "loss": 0.90880042, + "num_input_tokens_seen": 175967152, + "router_z_loss_mlp": 0.19360352, + "step": 2113, + "time_per_iteration": 2.7009260654449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105786, + "balance_loss_mlp": 1.04798985, + "epoch": 0.40669488264717196, + "flos": 1580147564544.0, + "grad_norm": 0.033284036056789104, + "language_loss": 0.78622639, + "learning_rate": 0.0006718230206974884, + "loss": 0.79680502, + "num_input_tokens_seen": 176205248, + "router_z_loss_mlp": 0.09863281, + "step": 2114, + "time_per_iteration": 4.983005523681641 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01127405, + "balance_loss_mlp": 1.10823643, + "epoch": 0.40688726433243555, + "flos": 507649503744.0, + "grad_norm": 0.07850906735960049, + "language_loss": 0.85233408, + "learning_rate": 0.0006715304182135078, + "loss": 0.86360812, + "num_input_tokens_seen": 176276208, + "router_z_loss_mlp": 0.19152832, + "step": 2115, + "time_per_iteration": 2.6078672409057617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114305, + "balance_loss_mlp": 1.09480286, + "epoch": 0.40707964601769914, + "flos": 589075172352.0, + "grad_norm": 0.063032684383759, + "language_loss": 0.88685012, + "learning_rate": 0.0006712377491355127, + "loss": 0.89799315, + "num_input_tokens_seen": 176355072, + "router_z_loss_mlp": 0.19482422, + "step": 2116, + "time_per_iteration": 2.8919928073883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011132, + "balance_loss_mlp": 1.09403157, + "epoch": 0.40727202770296267, + "flos": 580437259776.0, + "grad_norm": 0.07591389839440288, + "language_loss": 0.81216896, + "learning_rate": 0.0006709450135771274, + "loss": 0.82330096, + "num_input_tokens_seen": 176444592, + "router_z_loss_mlp": 0.19152832, + "step": 2117, + "time_per_iteration": 2.948209524154663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110152, + "balance_loss_mlp": 1.09097123, + "epoch": 0.40746440938822626, + "flos": 504076649472.0, + "grad_norm": 0.0664106663118444, + "language_loss": 0.86270058, + "learning_rate": 0.0006706522116520023, + "loss": 0.87380207, + "num_input_tokens_seen": 176516144, + "router_z_loss_mlp": 0.19177246, + "step": 2118, + "time_per_iteration": 2.63297963142395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109766, + "balance_loss_mlp": 1.09078836, + "epoch": 0.4076567910734898, + "flos": 605600312832.0, + "grad_norm": 0.08309315753094405, + "language_loss": 0.82646739, + "learning_rate": 0.0006703593434738127, + "loss": 0.83756506, + 
"num_input_tokens_seen": 176585712, + "router_z_loss_mlp": 0.18969727, + "step": 2119, + "time_per_iteration": 2.7504334449768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110305, + "balance_loss_mlp": 1.08339202, + "epoch": 0.4078491727587534, + "flos": 479553799680.0, + "grad_norm": 0.06315918122435989, + "language_loss": 0.78157568, + "learning_rate": 0.0006700664091562604, + "loss": 0.79260623, + "num_input_tokens_seen": 176654736, + "router_z_loss_mlp": 0.1965332, + "step": 2120, + "time_per_iteration": 2.5809123516082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109509, + "balance_loss_mlp": 1.08968461, + "epoch": 0.4080415544440169, + "flos": 510384665088.0, + "grad_norm": 0.06251573302429693, + "language_loss": 0.84974718, + "learning_rate": 0.0006697734088130725, + "loss": 0.86084229, + "num_input_tokens_seen": 176722800, + "router_z_loss_mlp": 0.19812012, + "step": 2121, + "time_per_iteration": 2.6444742679595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103604, + "balance_loss_mlp": 1.08350492, + "epoch": 0.4082339361292805, + "flos": 734638947840.0, + "grad_norm": 0.08444724355881765, + "language_loss": 0.85282058, + "learning_rate": 0.0006694803425580018, + "loss": 0.86385661, + "num_input_tokens_seen": 176800320, + "router_z_loss_mlp": 0.20080566, + "step": 2122, + "time_per_iteration": 2.9844353199005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101862, + "balance_loss_mlp": 1.08126235, + "epoch": 0.4084263178145441, + "flos": 457472074752.0, + "grad_norm": 0.08120556309716129, + "language_loss": 0.84838599, + "learning_rate": 0.0006691872105048268, + "loss": 0.85940456, + "num_input_tokens_seen": 176867440, + "router_z_loss_mlp": 0.20605469, + "step": 2123, + "time_per_iteration": 2.587648868560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104271, + "balance_loss_mlp": 1.08323061, + "epoch": 0.4086186994998076, + "flos": 
562931638272.0, + "grad_norm": 0.07277240915985977, + "language_loss": 0.84579539, + "learning_rate": 0.0006688940127673513, + "loss": 0.85683805, + "num_input_tokens_seen": 176942048, + "router_z_loss_mlp": 0.21044922, + "step": 2124, + "time_per_iteration": 2.6976451873779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108248, + "balance_loss_mlp": 1.08663535, + "epoch": 0.4088110811850712, + "flos": 573669651456.0, + "grad_norm": 0.07888289921071225, + "language_loss": 0.85375637, + "learning_rate": 0.0006686007494594049, + "loss": 0.86483884, + "num_input_tokens_seen": 177025104, + "router_z_loss_mlp": 0.21618652, + "step": 2125, + "time_per_iteration": 2.842721700668335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109657, + "balance_loss_mlp": 1.075279, + "epoch": 0.40900346287033473, + "flos": 456930989568.0, + "grad_norm": 0.1494487487543463, + "language_loss": 0.80707026, + "learning_rate": 0.0006683074206948425, + "loss": 0.81803596, + "num_input_tokens_seen": 177089296, + "router_z_loss_mlp": 0.21289062, + "step": 2126, + "time_per_iteration": 2.54156231880188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088684, + "balance_loss_mlp": 1.06790602, + "epoch": 0.4091958445555983, + "flos": 617395903488.0, + "grad_norm": 0.07127639192135228, + "language_loss": 0.81315231, + "learning_rate": 0.0006680140265875443, + "loss": 0.82403916, + "num_input_tokens_seen": 177163648, + "router_z_loss_mlp": 0.20788574, + "step": 2127, + "time_per_iteration": 2.8282980918884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093135, + "balance_loss_mlp": 1.07241678, + "epoch": 0.40938822624086185, + "flos": 472400750592.0, + "grad_norm": 0.07736719826860473, + "language_loss": 0.953547, + "learning_rate": 0.0006677205672514162, + "loss": 0.96447837, + "num_input_tokens_seen": 177233856, + "router_z_loss_mlp": 0.20715332, + "step": 2128, + "time_per_iteration": 2.635601758956909 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089263, + "balance_loss_mlp": 1.06965339, + "epoch": 0.40958060792612544, + "flos": 570010535424.0, + "grad_norm": 0.07314070036202396, + "language_loss": 0.88630438, + "learning_rate": 0.000667427042800389, + "loss": 0.89719707, + "num_input_tokens_seen": 177309824, + "router_z_loss_mlp": 0.19604492, + "step": 2129, + "time_per_iteration": 2.792956829071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094452, + "balance_loss_mlp": 1.07447219, + "epoch": 0.40977298961138897, + "flos": 609346063872.0, + "grad_norm": 0.07258896862524182, + "language_loss": 0.82793128, + "learning_rate": 0.0006671334533484192, + "loss": 0.83887583, + "num_input_tokens_seen": 177380592, + "router_z_loss_mlp": 0.19970703, + "step": 2130, + "time_per_iteration": 2.773900270462036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_mlp": 1.07694483, + "epoch": 0.40996537129665256, + "flos": 581744457216.0, + "grad_norm": 0.07325583153216161, + "language_loss": 0.83178955, + "learning_rate": 0.0006668397990094881, + "loss": 0.84274781, + "num_input_tokens_seen": 177454720, + "router_z_loss_mlp": 0.1887207, + "step": 2131, + "time_per_iteration": 2.752606153488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110283, + "balance_loss_mlp": 1.08409071, + "epoch": 0.41015775298191615, + "flos": 516546948096.0, + "grad_norm": 0.08072513277707091, + "language_loss": 0.84810466, + "learning_rate": 0.0006665460798976027, + "loss": 0.85913295, + "num_input_tokens_seen": 177528224, + "router_z_loss_mlp": 0.18725586, + "step": 2132, + "time_per_iteration": 2.7918195724487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101646, + "balance_loss_mlp": 1.08277488, + "epoch": 0.4103501346671797, + "flos": 510354929664.0, + "grad_norm": 0.057661652953568024, + "language_loss": 0.8113941, + "learning_rate": 0.0006662522961267947, + "loss": 0.82241058, 
+ "num_input_tokens_seen": 177598176, + "router_z_loss_mlp": 0.18859863, + "step": 2133, + "time_per_iteration": 2.7084174156188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114188, + "balance_loss_mlp": 1.09586525, + "epoch": 0.41054251635244327, + "flos": 549752126976.0, + "grad_norm": 0.07117823449693282, + "language_loss": 0.86957145, + "learning_rate": 0.0006659584478111211, + "loss": 0.88071334, + "num_input_tokens_seen": 177675840, + "router_z_loss_mlp": 0.18322754, + "step": 2134, + "time_per_iteration": 2.8745734691619873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120532, + "balance_loss_mlp": 1.10234094, + "epoch": 0.4107348980377068, + "flos": 839898450432.0, + "grad_norm": 0.10436544040673855, + "language_loss": 0.82673836, + "learning_rate": 0.000665664535064664, + "loss": 0.83794367, + "num_input_tokens_seen": 177751376, + "router_z_loss_mlp": 0.1817627, + "step": 2135, + "time_per_iteration": 3.0361244678497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120758, + "balance_loss_mlp": 1.10167265, + "epoch": 0.4109272797229704, + "flos": 503708461056.0, + "grad_norm": 0.07372821186051039, + "language_loss": 0.82676935, + "learning_rate": 0.0006653705580015303, + "loss": 0.83797693, + "num_input_tokens_seen": 177825264, + "router_z_loss_mlp": 0.1907959, + "step": 2136, + "time_per_iteration": 2.6784329414367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121286, + "balance_loss_mlp": 1.10184264, + "epoch": 0.4111196614082339, + "flos": 610830927360.0, + "grad_norm": 0.08099943161450797, + "language_loss": 0.8610462, + "learning_rate": 0.0006650765167358523, + "loss": 0.87225902, + "num_input_tokens_seen": 177901680, + "router_z_loss_mlp": 0.19421387, + "step": 2137, + "time_per_iteration": 2.8350300788879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113363, + "balance_loss_mlp": 1.09431374, + "epoch": 0.4113120430934975, + "flos": 
453165414912.0, + "grad_norm": 0.09328592607957716, + "language_loss": 0.89696336, + "learning_rate": 0.0006647824113817864, + "loss": 0.90809703, + "num_input_tokens_seen": 177965264, + "router_z_loss_mlp": 0.19030762, + "step": 2138, + "time_per_iteration": 2.5345799922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112598, + "balance_loss_mlp": 1.09391761, + "epoch": 0.41150442477876104, + "flos": 541600971264.0, + "grad_norm": 0.24980936370747706, + "language_loss": 0.81674927, + "learning_rate": 0.000664488242053515, + "loss": 0.82787526, + "num_input_tokens_seen": 178039712, + "router_z_loss_mlp": 0.18688965, + "step": 2139, + "time_per_iteration": 2.729074716567993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112354, + "balance_loss_mlp": 1.09430587, + "epoch": 0.4116968064640246, + "flos": 576291386880.0, + "grad_norm": 0.06520257719296937, + "language_loss": 0.8372556, + "learning_rate": 0.0006641940088652445, + "loss": 0.84837914, + "num_input_tokens_seen": 178114080, + "router_z_loss_mlp": 0.18054199, + "step": 2140, + "time_per_iteration": 2.822861909866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114304, + "balance_loss_mlp": 1.09476542, + "epoch": 0.4118891881492882, + "flos": 496115642880.0, + "grad_norm": 0.09690666410410188, + "language_loss": 0.82505018, + "learning_rate": 0.0006638997119312065, + "loss": 0.8361932, + "num_input_tokens_seen": 178188032, + "router_z_loss_mlp": 0.1953125, + "step": 2141, + "time_per_iteration": 2.7164361476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081351, + "balance_loss_mlp": 1.0707655, + "epoch": 0.41208156983455174, + "flos": 1538395877376.0, + "grad_norm": 0.03550975461959617, + "language_loss": 0.75063306, + "learning_rate": 0.0006636053513656568, + "loss": 0.7614466, + "num_input_tokens_seen": 178395328, + "router_z_loss_mlp": 0.10595703, + "step": 2142, + "time_per_iteration": 4.951165437698364 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116466, + "balance_loss_mlp": 1.09734452, + "epoch": 0.41227395151981533, + "flos": 584968946688.0, + "grad_norm": 0.10349541439789608, + "language_loss": 0.8488189, + "learning_rate": 0.000663310927282877, + "loss": 0.8599835, + "num_input_tokens_seen": 178471952, + "router_z_loss_mlp": 0.19116211, + "step": 2143, + "time_per_iteration": 2.834325075149536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123685, + "balance_loss_mlp": 1.10346723, + "epoch": 0.41246633320507886, + "flos": 442926268416.0, + "grad_norm": 0.07414481576642443, + "language_loss": 0.85735166, + "learning_rate": 0.000663016439797172, + "loss": 0.86858845, + "num_input_tokens_seen": 178542192, + "router_z_loss_mlp": 0.20214844, + "step": 2144, + "time_per_iteration": 2.641390800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118144, + "balance_loss_mlp": 1.09814095, + "epoch": 0.41265871489034245, + "flos": 579962985984.0, + "grad_norm": 0.07853696289984005, + "language_loss": 0.80941319, + "learning_rate": 0.0006627218890228724, + "loss": 0.82059467, + "num_input_tokens_seen": 178622736, + "router_z_loss_mlp": 0.20007324, + "step": 2145, + "time_per_iteration": 2.7847142219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115703, + "balance_loss_mlp": 1.0958544, + "epoch": 0.412851096575606, + "flos": 761229964800.0, + "grad_norm": 0.07518431098775835, + "language_loss": 0.83727562, + "learning_rate": 0.0006624272750743326, + "loss": 0.84843272, + "num_input_tokens_seen": 178705808, + "router_z_loss_mlp": 0.19836426, + "step": 2146, + "time_per_iteration": 3.0317938327789307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117359, + "balance_loss_mlp": 1.09733224, + "epoch": 0.41304347826086957, + "flos": 555353501184.0, + "grad_norm": 0.06462993006694184, + "language_loss": 0.8283999, + "learning_rate": 0.0006621325980659322, + "loss": 0.83957344, + 
"num_input_tokens_seen": 178781200, + "router_z_loss_mlp": 0.20019531, + "step": 2147, + "time_per_iteration": 2.786724328994751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118, + "balance_loss_mlp": 1.0978415, + "epoch": 0.41323585994613315, + "flos": 665712940032.0, + "grad_norm": 0.10640671392978962, + "language_loss": 0.81600213, + "learning_rate": 0.000661837858112075, + "loss": 0.82718211, + "num_input_tokens_seen": 178855072, + "router_z_loss_mlp": 0.20153809, + "step": 2148, + "time_per_iteration": 2.854837417602539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115633, + "balance_loss_mlp": 1.09577227, + "epoch": 0.4134282416313967, + "flos": 548699692032.0, + "grad_norm": 0.06752887879335369, + "language_loss": 0.88443303, + "learning_rate": 0.0006615430553271888, + "loss": 0.89558935, + "num_input_tokens_seen": 178927936, + "router_z_loss_mlp": 0.19848633, + "step": 2149, + "time_per_iteration": 2.8243539333343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115349, + "balance_loss_mlp": 1.09486902, + "epoch": 0.4136206233166603, + "flos": 646262489088.0, + "grad_norm": 0.06757702274708675, + "language_loss": 0.85010874, + "learning_rate": 0.0006612481898257264, + "loss": 0.8612622, + "num_input_tokens_seen": 179007792, + "router_z_loss_mlp": 0.20483398, + "step": 2150, + "time_per_iteration": 2.870486259460449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114158, + "balance_loss_mlp": 1.09377337, + "epoch": 0.4138130050019238, + "flos": 517354905600.0, + "grad_norm": 0.08316851802653256, + "language_loss": 0.85005617, + "learning_rate": 0.000660953261722165, + "loss": 0.86119783, + "num_input_tokens_seen": 179075200, + "router_z_loss_mlp": 0.20385742, + "step": 2151, + "time_per_iteration": 2.6056485176086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112582, + "balance_loss_mlp": 1.09265018, + "epoch": 0.4140053866871874, + "flos": 
609254659584.0, + "grad_norm": 0.06870221870710541, + "language_loss": 0.82367688, + "learning_rate": 0.0006606582711310055, + "loss": 0.83480269, + "num_input_tokens_seen": 179144448, + "router_z_loss_mlp": 0.19934082, + "step": 2152, + "time_per_iteration": 2.7264139652252197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119446, + "balance_loss_mlp": 1.09854901, + "epoch": 0.4141977683724509, + "flos": 579762925056.0, + "grad_norm": 0.0720639200532027, + "language_loss": 0.83059323, + "learning_rate": 0.0006603632181667736, + "loss": 0.8417877, + "num_input_tokens_seen": 179215776, + "router_z_loss_mlp": 0.20910645, + "step": 2153, + "time_per_iteration": 2.6930761337280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01055226, + "balance_loss_mlp": 1.04149318, + "epoch": 0.4143901500577145, + "flos": 1307312317440.0, + "grad_norm": 0.029268536031501605, + "language_loss": 0.78943324, + "learning_rate": 0.0006600681029440187, + "loss": 0.79998553, + "num_input_tokens_seen": 179436688, + "router_z_loss_mlp": 0.13769531, + "step": 2154, + "time_per_iteration": 4.951904773712158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133368, + "balance_loss_mlp": 1.11335301, + "epoch": 0.41458253174297804, + "flos": 460189983744.0, + "grad_norm": 0.08213185756435645, + "language_loss": 0.81797659, + "learning_rate": 0.0006597729255773153, + "loss": 0.82931024, + "num_input_tokens_seen": 179503264, + "router_z_loss_mlp": 0.20007324, + "step": 2155, + "time_per_iteration": 2.6153218746185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142819, + "balance_loss_mlp": 1.1224227, + "epoch": 0.41477491342824163, + "flos": 553364628480.0, + "grad_norm": 0.0847752552783981, + "language_loss": 0.82203597, + "learning_rate": 0.0006594776861812608, + "loss": 0.83346415, + "num_input_tokens_seen": 179574864, + "router_z_loss_mlp": 0.20397949, + "step": 2156, + "time_per_iteration": 2.68922758102417 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153838, + "balance_loss_mlp": 1.13410926, + "epoch": 0.4149672951135052, + "flos": 697771708416.0, + "grad_norm": 0.06809079383741527, + "language_loss": 0.86262864, + "learning_rate": 0.0006591823848704776, + "loss": 0.87416703, + "num_input_tokens_seen": 179658208, + "router_z_loss_mlp": 0.1973877, + "step": 2157, + "time_per_iteration": 2.9523754119873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147297, + "balance_loss_mlp": 1.12693584, + "epoch": 0.41515967679876875, + "flos": 565750863360.0, + "grad_norm": 0.07690135227418383, + "language_loss": 0.81358635, + "learning_rate": 0.0006588870217596117, + "loss": 0.82505929, + "num_input_tokens_seen": 179732320, + "router_z_loss_mlp": 0.20361328, + "step": 2158, + "time_per_iteration": 2.7730822563171387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140876, + "balance_loss_mlp": 1.12146926, + "epoch": 0.41535205848403234, + "flos": 501185843712.0, + "grad_norm": 0.08370852265526307, + "language_loss": 0.857876, + "learning_rate": 0.0006585915969633334, + "loss": 0.86928475, + "num_input_tokens_seen": 179801616, + "router_z_loss_mlp": 0.19396973, + "step": 2159, + "time_per_iteration": 2.6628706455230713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133859, + "balance_loss_mlp": 1.1143918, + "epoch": 0.41554444016929587, + "flos": 607554680832.0, + "grad_norm": 0.07868666241976846, + "language_loss": 0.8926276, + "learning_rate": 0.0006582961105963366, + "loss": 0.90396619, + "num_input_tokens_seen": 179876112, + "router_z_loss_mlp": 0.19445801, + "step": 2160, + "time_per_iteration": 2.856227397918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126433, + "balance_loss_mlp": 1.10702562, + "epoch": 0.41573682185455946, + "flos": 529115991552.0, + "grad_norm": 0.10110909063497833, + "language_loss": 0.77701914, + "learning_rate": 0.0006580005627733395, + "loss": 
0.78828347, + "num_input_tokens_seen": 179949936, + "router_z_loss_mlp": 0.19396973, + "step": 2161, + "time_per_iteration": 2.763690948486328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131659, + "balance_loss_mlp": 1.11281204, + "epoch": 0.415929203539823, + "flos": 504956187648.0, + "grad_norm": 0.0788483903527846, + "language_loss": 0.81671721, + "learning_rate": 0.0006577049536090838, + "loss": 0.8280338, + "num_input_tokens_seen": 180023184, + "router_z_loss_mlp": 0.18823242, + "step": 2162, + "time_per_iteration": 2.7156083583831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130408, + "balance_loss_mlp": 1.11114359, + "epoch": 0.4161215852250866, + "flos": 582737794560.0, + "grad_norm": 0.08609543464950487, + "language_loss": 0.85536218, + "learning_rate": 0.000657409283218335, + "loss": 0.8666662, + "num_input_tokens_seen": 180091728, + "router_z_loss_mlp": 0.19250488, + "step": 2163, + "time_per_iteration": 2.711332082748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135342, + "balance_loss_mlp": 1.11707878, + "epoch": 0.4163139669103501, + "flos": 490697077248.0, + "grad_norm": 0.08463355465100361, + "language_loss": 0.81072271, + "learning_rate": 0.0006571135517158829, + "loss": 0.82207608, + "num_input_tokens_seen": 180162096, + "router_z_loss_mlp": 0.18273926, + "step": 2164, + "time_per_iteration": 2.6715452671051025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01029054, + "balance_loss_mlp": 1.01865911, + "epoch": 0.4165063485956137, + "flos": 1288158474240.0, + "grad_norm": 0.01758070932569607, + "language_loss": 0.76764059, + "learning_rate": 0.0006568177592165404, + "loss": 0.77793115, + "num_input_tokens_seen": 180380912, + "router_z_loss_mlp": 0.10400391, + "step": 2165, + "time_per_iteration": 4.765650272369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154419, + "balance_loss_mlp": 1.13588202, + "epoch": 0.4166987302808773, + 
"flos": 495263268864.0, + "grad_norm": 0.09117992314911788, + "language_loss": 0.828076, + "learning_rate": 0.0006565219058351444, + "loss": 0.83962023, + "num_input_tokens_seen": 180447424, + "router_z_loss_mlp": 0.18530273, + "step": 2166, + "time_per_iteration": 2.568162202835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160705, + "balance_loss_mlp": 1.14153659, + "epoch": 0.4168911119661408, + "flos": 464071555584.0, + "grad_norm": 0.1435965153845973, + "language_loss": 0.82720423, + "learning_rate": 0.0006562259916865553, + "loss": 0.83881128, + "num_input_tokens_seen": 180516336, + "router_z_loss_mlp": 0.19165039, + "step": 2167, + "time_per_iteration": 2.577831506729126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146961, + "balance_loss_mlp": 1.12813759, + "epoch": 0.4170834936514044, + "flos": 536787730944.0, + "grad_norm": 0.10197305761412122, + "language_loss": 0.79348731, + "learning_rate": 0.0006559300168856573, + "loss": 0.80495691, + "num_input_tokens_seen": 180589824, + "router_z_loss_mlp": 0.18798828, + "step": 2168, + "time_per_iteration": 2.7849843502044678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143742, + "balance_loss_mlp": 1.12485933, + "epoch": 0.41727587533666793, + "flos": 550683795456.0, + "grad_norm": 0.07754195288754885, + "language_loss": 0.86023396, + "learning_rate": 0.0006556339815473577, + "loss": 0.87167138, + "num_input_tokens_seen": 180661296, + "router_z_loss_mlp": 0.1887207, + "step": 2169, + "time_per_iteration": 2.7085328102111816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142677, + "balance_loss_mlp": 1.12390125, + "epoch": 0.4174682570219315, + "flos": 631111357440.0, + "grad_norm": 0.08981224380419678, + "language_loss": 0.86090291, + "learning_rate": 0.000655337885786588, + "loss": 0.87232965, + "num_input_tokens_seen": 180744896, + "router_z_loss_mlp": 0.1875, + "step": 2170, + "time_per_iteration": 2.9244213104248047 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128382, + "balance_loss_mlp": 1.10963011, + "epoch": 0.41766063870719505, + "flos": 519751613952.0, + "grad_norm": 0.08419137591764536, + "language_loss": 0.8483454, + "learning_rate": 0.0006550417297183025, + "loss": 0.85962915, + "num_input_tokens_seen": 180813008, + "router_z_loss_mlp": 0.18737793, + "step": 2171, + "time_per_iteration": 2.6424126625061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116149, + "balance_loss_mlp": 1.09746861, + "epoch": 0.41785302039245864, + "flos": 557935589376.0, + "grad_norm": 0.07276027667818112, + "language_loss": 0.81700563, + "learning_rate": 0.0006547455134574793, + "loss": 0.82816714, + "num_input_tokens_seen": 180886480, + "router_z_loss_mlp": 0.18664551, + "step": 2172, + "time_per_iteration": 2.743807315826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118053, + "balance_loss_mlp": 1.09920597, + "epoch": 0.41804540207772223, + "flos": 788529821184.0, + "grad_norm": 0.06582530373346562, + "language_loss": 0.83907378, + "learning_rate": 0.0006544492371191198, + "loss": 0.85025424, + "num_input_tokens_seen": 180973776, + "router_z_loss_mlp": 0.18847656, + "step": 2173, + "time_per_iteration": 3.1398048400878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112075, + "balance_loss_mlp": 1.09203625, + "epoch": 0.41823778376298576, + "flos": 904332418560.0, + "grad_norm": 0.07927924785081189, + "language_loss": 0.83028531, + "learning_rate": 0.0006541529008182485, + "loss": 0.84140611, + "num_input_tokens_seen": 181062768, + "router_z_loss_mlp": 0.20031738, + "step": 2174, + "time_per_iteration": 3.218675136566162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113128, + "balance_loss_mlp": 1.09423363, + "epoch": 0.41843016544824935, + "flos": 511560811008.0, + "grad_norm": 0.08063752220274202, + "language_loss": 0.87068301, + "learning_rate": 0.0006538565046699136, + "loss": 
0.88181424, + "num_input_tokens_seen": 181129872, + "router_z_loss_mlp": 0.18884277, + "step": 2175, + "time_per_iteration": 2.623373031616211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110698, + "balance_loss_mlp": 1.09179151, + "epoch": 0.4186225471335129, + "flos": 653077085184.0, + "grad_norm": 0.10224918928766584, + "language_loss": 0.80967259, + "learning_rate": 0.0006535600487891862, + "loss": 0.82077956, + "num_input_tokens_seen": 181208112, + "router_z_loss_mlp": 0.18896484, + "step": 2176, + "time_per_iteration": 2.858027935028076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108436, + "balance_loss_mlp": 1.08948123, + "epoch": 0.41881492881877647, + "flos": 569158161408.0, + "grad_norm": 0.0620502143296578, + "language_loss": 0.88827038, + "learning_rate": 0.0006532635332911603, + "loss": 0.89935476, + "num_input_tokens_seen": 181278736, + "router_z_loss_mlp": 0.1895752, + "step": 2177, + "time_per_iteration": 2.6979219913482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126272, + "balance_loss_mlp": 1.10786629, + "epoch": 0.41900731050404, + "flos": 911878248960.0, + "grad_norm": 0.06643064450406437, + "language_loss": 0.80475914, + "learning_rate": 0.0006529669582909541, + "loss": 0.81602192, + "num_input_tokens_seen": 181362512, + "router_z_loss_mlp": 0.18408203, + "step": 2178, + "time_per_iteration": 3.246621608734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112675, + "balance_loss_mlp": 1.10820079, + "epoch": 0.4191996921893036, + "flos": 535755119616.0, + "grad_norm": 0.08441696273800357, + "language_loss": 0.85626066, + "learning_rate": 0.0006526703239037077, + "loss": 0.8675282, + "num_input_tokens_seen": 181432080, + "router_z_loss_mlp": 0.1854248, + "step": 2179, + "time_per_iteration": 2.67114520072937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126772, + "balance_loss_mlp": 1.10779428, + "epoch": 0.4193920738745671, + "flos": 
582636478464.0, + "grad_norm": 0.07577304920294069, + "language_loss": 0.86212498, + "learning_rate": 0.0006523736302445851, + "loss": 0.8733927, + "num_input_tokens_seen": 181507296, + "router_z_loss_mlp": 0.18969727, + "step": 2180, + "time_per_iteration": 2.7883896827697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132192, + "balance_loss_mlp": 1.11371422, + "epoch": 0.4195844555598307, + "flos": 1335782472192.0, + "grad_norm": 0.08559665169482955, + "language_loss": 0.77047896, + "learning_rate": 0.0006520768774287728, + "loss": 0.78180093, + "num_input_tokens_seen": 181599408, + "router_z_loss_mlp": 0.18469238, + "step": 2181, + "time_per_iteration": 3.777104616165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127516, + "balance_loss_mlp": 1.10862184, + "epoch": 0.4197768372450943, + "flos": 598783145472.0, + "grad_norm": 0.06395892384144108, + "language_loss": 0.85356331, + "learning_rate": 0.0006517800655714806, + "loss": 0.86483848, + "num_input_tokens_seen": 181674944, + "router_z_loss_mlp": 0.18884277, + "step": 2182, + "time_per_iteration": 2.8449056148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116562, + "balance_loss_mlp": 1.09781027, + "epoch": 0.4199692189303578, + "flos": 735261525504.0, + "grad_norm": 0.07104751702384272, + "language_loss": 0.85029149, + "learning_rate": 0.0006514831947879407, + "loss": 0.86145711, + "num_input_tokens_seen": 181756704, + "router_z_loss_mlp": 0.18737793, + "step": 2183, + "time_per_iteration": 2.990061044692993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107738, + "balance_loss_mlp": 1.08917689, + "epoch": 0.4201616006156214, + "flos": 750214794240.0, + "grad_norm": 0.10339737087855795, + "language_loss": 0.78075212, + "learning_rate": 0.0006511862651934091, + "loss": 0.79182947, + "num_input_tokens_seen": 181837952, + "router_z_loss_mlp": 0.18566895, + "step": 2184, + "time_per_iteration": 3.0668697357177734 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107089, + "balance_loss_mlp": 1.08805084, + "epoch": 0.42035398230088494, + "flos": 547029448704.0, + "grad_norm": 0.06769253041220206, + "language_loss": 0.8183164, + "learning_rate": 0.0006508892769031638, + "loss": 0.82938731, + "num_input_tokens_seen": 181906896, + "router_z_loss_mlp": 0.19018555, + "step": 2185, + "time_per_iteration": 2.6562998294830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109571, + "balance_loss_mlp": 1.0908668, + "epoch": 0.42054636398614853, + "flos": 616911717888.0, + "grad_norm": 0.09820566679610492, + "language_loss": 0.86607713, + "learning_rate": 0.000650592230032506, + "loss": 0.87717283, + "num_input_tokens_seen": 181974976, + "router_z_loss_mlp": 0.18676758, + "step": 2186, + "time_per_iteration": 2.7687323093414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115297, + "balance_loss_mlp": 1.09592557, + "epoch": 0.42073874567141206, + "flos": 640394242560.0, + "grad_norm": 0.07480815577141971, + "language_loss": 0.84954965, + "learning_rate": 0.0006502951246967595, + "loss": 0.86070257, + "num_input_tokens_seen": 182054704, + "router_z_loss_mlp": 0.19360352, + "step": 2187, + "time_per_iteration": 2.8850929737091064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105112, + "balance_loss_mlp": 1.0856576, + "epoch": 0.42093112735667565, + "flos": 493783174656.0, + "grad_norm": 0.07526055561420332, + "language_loss": 0.86650884, + "learning_rate": 0.0006499979610112706, + "loss": 0.87756002, + "num_input_tokens_seen": 182129696, + "router_z_loss_mlp": 0.19445801, + "step": 2188, + "time_per_iteration": 2.6973655223846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110798, + "balance_loss_mlp": 1.09087813, + "epoch": 0.4211235090419392, + "flos": 542364512256.0, + "grad_norm": 0.09941258674264111, + "language_loss": 0.84241974, + "learning_rate": 0.000649700739091409, + "loss": 
0.85352778, + "num_input_tokens_seen": 182203792, + "router_z_loss_mlp": 0.19921875, + "step": 2189, + "time_per_iteration": 2.701725482940674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067004, + "balance_loss_mlp": 1.05665708, + "epoch": 0.42131589072720277, + "flos": 1532149530624.0, + "grad_norm": 0.03283150548513283, + "language_loss": 0.73836273, + "learning_rate": 0.0006494034590525657, + "loss": 0.7490328, + "num_input_tokens_seen": 182432080, + "router_z_loss_mlp": 0.10351562, + "step": 2190, + "time_per_iteration": 4.839817523956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010909, + "balance_loss_mlp": 1.07154024, + "epoch": 0.42150827241246636, + "flos": 566852857344.0, + "grad_norm": 0.06598643326088396, + "language_loss": 0.85153967, + "learning_rate": 0.0006491061210101557, + "loss": 0.86244869, + "num_input_tokens_seen": 182500256, + "router_z_loss_mlp": 0.19335938, + "step": 2191, + "time_per_iteration": 2.7196173667907715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010923, + "balance_loss_mlp": 1.07263041, + "epoch": 0.4217006540977299, + "flos": 707561174016.0, + "grad_norm": 0.0656106941658015, + "language_loss": 0.83940744, + "learning_rate": 0.0006488087250796157, + "loss": 0.85033047, + "num_input_tokens_seen": 182582912, + "router_z_loss_mlp": 0.1965332, + "step": 2192, + "time_per_iteration": 2.906759262084961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092958, + "balance_loss_mlp": 1.07264447, + "epoch": 0.4218930357829935, + "flos": 627291454464.0, + "grad_norm": 0.07249831154737209, + "language_loss": 0.81628364, + "learning_rate": 0.0006485112713764049, + "loss": 0.82721323, + "num_input_tokens_seen": 182670304, + "router_z_loss_mlp": 0.203125, + "step": 2193, + "time_per_iteration": 2.92899227142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094124, + "balance_loss_mlp": 1.0746212, + "epoch": 0.422085417468257, + "flos": 
460345628160.0, + "grad_norm": 0.06737861087768351, + "language_loss": 0.83769715, + "learning_rate": 0.0006482137600160051, + "loss": 0.8486383, + "num_input_tokens_seen": 182735024, + "router_z_loss_mlp": 0.19506836, + "step": 2194, + "time_per_iteration": 2.5262770652770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085984, + "balance_loss_mlp": 1.06623149, + "epoch": 0.4222777991535206, + "flos": 474026577408.0, + "grad_norm": 0.06292139363287808, + "language_loss": 0.845213, + "learning_rate": 0.0006479161911139206, + "loss": 0.85607278, + "num_input_tokens_seen": 182805024, + "router_z_loss_mlp": 0.1973877, + "step": 2195, + "time_per_iteration": 2.6160459518432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108518, + "balance_loss_mlp": 1.06428266, + "epoch": 0.4224701808387841, + "flos": 470886151680.0, + "grad_norm": 0.08901996634588341, + "language_loss": 0.8583566, + "learning_rate": 0.0006476185647856778, + "loss": 0.8692084, + "num_input_tokens_seen": 182871360, + "router_z_loss_mlp": 0.2088623, + "step": 2196, + "time_per_iteration": 2.5868523120880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092161, + "balance_loss_mlp": 1.07174015, + "epoch": 0.4226625625240477, + "flos": 677525783040.0, + "grad_norm": 0.08593083287674207, + "language_loss": 0.8143295, + "learning_rate": 0.0006473208811468255, + "loss": 0.8252511, + "num_input_tokens_seen": 182952912, + "router_z_loss_mlp": 0.20422363, + "step": 2197, + "time_per_iteration": 2.8999974727630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094808, + "balance_loss_mlp": 1.07459044, + "epoch": 0.4228549442093113, + "flos": 503525652480.0, + "grad_norm": 0.06766081582077942, + "language_loss": 0.84457636, + "learning_rate": 0.0006470231403129347, + "loss": 0.85552448, + "num_input_tokens_seen": 183022016, + "router_z_loss_mlp": 0.20214844, + "step": 2198, + "time_per_iteration": 2.6292834281921387 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100004, + "balance_loss_mlp": 1.08031106, + "epoch": 0.42304732589457483, + "flos": 611848857600.0, + "grad_norm": 0.06420895179660353, + "language_loss": 0.81433302, + "learning_rate": 0.0006467253423995988, + "loss": 0.82533306, + "num_input_tokens_seen": 183101776, + "router_z_loss_mlp": 0.19677734, + "step": 2199, + "time_per_iteration": 2.891252040863037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106456, + "balance_loss_mlp": 1.08667946, + "epoch": 0.4232397075798384, + "flos": 515570863104.0, + "grad_norm": 0.09520170564639865, + "language_loss": 0.79070157, + "learning_rate": 0.000646427487522433, + "loss": 0.80176616, + "num_input_tokens_seen": 183171392, + "router_z_loss_mlp": 0.19763184, + "step": 2200, + "time_per_iteration": 2.6773481369018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114103, + "balance_loss_mlp": 1.09451675, + "epoch": 0.42343208926510195, + "flos": 589796868096.0, + "grad_norm": 0.05852623049494667, + "language_loss": 0.8313483, + "learning_rate": 0.0006461295757970749, + "loss": 0.84248924, + "num_input_tokens_seen": 183253936, + "router_z_loss_mlp": 0.19567871, + "step": 2201, + "time_per_iteration": 2.8689796924591064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134362, + "balance_loss_mlp": 1.11422753, + "epoch": 0.42362447095036554, + "flos": 640636521984.0, + "grad_norm": 0.08800937436321304, + "language_loss": 0.8125912, + "learning_rate": 0.0006458316073391839, + "loss": 0.82393485, + "num_input_tokens_seen": 183333744, + "router_z_loss_mlp": 0.20141602, + "step": 2202, + "time_per_iteration": 2.88208270072937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131451, + "balance_loss_mlp": 1.11259222, + "epoch": 0.42381685263562907, + "flos": 512680057344.0, + "grad_norm": 0.0666307669316128, + "language_loss": 0.87698853, + "learning_rate": 0.0006455335822644422, + "loss": 0.88830304, 
+ "num_input_tokens_seen": 183401904, + "router_z_loss_mlp": 0.18847656, + "step": 2203, + "time_per_iteration": 2.670079469680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148041, + "balance_loss_mlp": 1.12951636, + "epoch": 0.42400923432089266, + "flos": 546782400000.0, + "grad_norm": 0.09426146221356531, + "language_loss": 0.77927971, + "learning_rate": 0.0006452355006885527, + "loss": 0.79076016, + "num_input_tokens_seen": 183471312, + "router_z_loss_mlp": 0.18530273, + "step": 2204, + "time_per_iteration": 2.657381534576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113566, + "balance_loss_mlp": 1.11668229, + "epoch": 0.4242016160061562, + "flos": 622154815488.0, + "grad_norm": 0.09902645475712538, + "language_loss": 0.8715145, + "learning_rate": 0.0006449373627272412, + "loss": 0.88287115, + "num_input_tokens_seen": 183539184, + "router_z_loss_mlp": 0.18969727, + "step": 2205, + "time_per_iteration": 2.731816053390503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119544, + "balance_loss_mlp": 1.10088801, + "epoch": 0.4243939976914198, + "flos": 571913146368.0, + "grad_norm": 0.08117714281203407, + "language_loss": 0.82472396, + "learning_rate": 0.0006446391684962553, + "loss": 0.8359195, + "num_input_tokens_seen": 183607504, + "router_z_loss_mlp": 0.18652344, + "step": 2206, + "time_per_iteration": 2.6578545570373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111897, + "balance_loss_mlp": 1.09364557, + "epoch": 0.42458637937668336, + "flos": 448740186624.0, + "grad_norm": 0.07468362398894425, + "language_loss": 0.83251357, + "learning_rate": 0.000644340918111364, + "loss": 0.84363258, + "num_input_tokens_seen": 183674720, + "router_z_loss_mlp": 0.18249512, + "step": 2207, + "time_per_iteration": 2.56805419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116636, + "balance_loss_mlp": 1.09764564, + "epoch": 0.4247787610619469, + "flos": 
435407602176.0, + "grad_norm": 0.07806782722385266, + "language_loss": 0.84652972, + "learning_rate": 0.0006440426116883585, + "loss": 0.85769606, + "num_input_tokens_seen": 183740448, + "router_z_loss_mlp": 0.18981934, + "step": 2208, + "time_per_iteration": 2.5546016693115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117381, + "balance_loss_mlp": 1.09860539, + "epoch": 0.4249711427472105, + "flos": 496078566912.0, + "grad_norm": 0.06957413499154663, + "language_loss": 0.86008334, + "learning_rate": 0.0006437442493430519, + "loss": 0.87125719, + "num_input_tokens_seen": 183812640, + "router_z_loss_mlp": 0.18762207, + "step": 2209, + "time_per_iteration": 2.709622621536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116308, + "balance_loss_mlp": 1.09817648, + "epoch": 0.425163524432474, + "flos": 655819587072.0, + "grad_norm": 0.07293604534963509, + "language_loss": 0.86852837, + "learning_rate": 0.000643445831191278, + "loss": 0.87969142, + "num_input_tokens_seen": 183895312, + "router_z_loss_mlp": 0.18127441, + "step": 2210, + "time_per_iteration": 2.9363558292388916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129839, + "balance_loss_mlp": 1.11201715, + "epoch": 0.4253559061177376, + "flos": 650608796160.0, + "grad_norm": 0.09052715570846585, + "language_loss": 0.81454134, + "learning_rate": 0.0006431473573488937, + "loss": 0.82583976, + "num_input_tokens_seen": 183966384, + "router_z_loss_mlp": 0.17834473, + "step": 2211, + "time_per_iteration": 2.824688196182251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113072, + "balance_loss_mlp": 1.09480882, + "epoch": 0.42554828780300114, + "flos": 554155333632.0, + "grad_norm": 0.1062817873742978, + "language_loss": 0.8489396, + "learning_rate": 0.0006428488279317765, + "loss": 0.86007035, + "num_input_tokens_seen": 184031728, + "router_z_loss_mlp": 0.18273926, + "step": 2212, + "time_per_iteration": 2.7016141414642334 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115888, + "balance_loss_mlp": 1.0979948, + "epoch": 0.4257406694882647, + "flos": 514407200256.0, + "grad_norm": 0.11732172807674658, + "language_loss": 0.87377149, + "learning_rate": 0.0006425502430558259, + "loss": 0.88493037, + "num_input_tokens_seen": 184096160, + "router_z_loss_mlp": 0.17907715, + "step": 2213, + "time_per_iteration": 2.618800640106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119747, + "balance_loss_mlp": 1.10144818, + "epoch": 0.42593305117352825, + "flos": 515645015040.0, + "grad_norm": 0.0715384053232906, + "language_loss": 0.84687829, + "learning_rate": 0.0006422516028369628, + "loss": 0.85807574, + "num_input_tokens_seen": 184169664, + "router_z_loss_mlp": 0.18310547, + "step": 2214, + "time_per_iteration": 2.6705808639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111255, + "balance_loss_mlp": 1.09299207, + "epoch": 0.42612543285879184, + "flos": 588059813376.0, + "grad_norm": 0.10790889315219483, + "language_loss": 0.83148849, + "learning_rate": 0.0006419529073911296, + "loss": 0.84260106, + "num_input_tokens_seen": 184249152, + "router_z_loss_mlp": 0.18261719, + "step": 2215, + "time_per_iteration": 2.8703150749206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129195, + "balance_loss_mlp": 1.11081314, + "epoch": 0.42631781454405543, + "flos": 635472345600.0, + "grad_norm": 0.06359649877678734, + "language_loss": 0.85258245, + "learning_rate": 0.0006416541568342901, + "loss": 0.86387444, + "num_input_tokens_seen": 184326816, + "router_z_loss_mlp": 0.18383789, + "step": 2216, + "time_per_iteration": 2.8891868591308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150464, + "balance_loss_mlp": 1.13197434, + "epoch": 0.42651019622931896, + "flos": 541161202176.0, + "grad_norm": 0.08324056394919786, + "language_loss": 0.84084767, + "learning_rate": 0.0006413553512824297, + "loss": 
0.85235232, + "num_input_tokens_seen": 184404336, + "router_z_loss_mlp": 0.18481445, + "step": 2217, + "time_per_iteration": 2.7485709190368652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114864, + "balance_loss_mlp": 1.13043642, + "epoch": 0.42670257791458255, + "flos": 558158045184.0, + "grad_norm": 0.07361406588428895, + "language_loss": 0.84362692, + "learning_rate": 0.0006410564908515549, + "loss": 0.85511333, + "num_input_tokens_seen": 184472320, + "router_z_loss_mlp": 0.18200684, + "step": 2218, + "time_per_iteration": 2.657747507095337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147496, + "balance_loss_mlp": 1.12895846, + "epoch": 0.4268949595998461, + "flos": 621309782016.0, + "grad_norm": 0.08313238940479123, + "language_loss": 0.85059869, + "learning_rate": 0.0006407575756576935, + "loss": 0.86207366, + "num_input_tokens_seen": 184544704, + "router_z_loss_mlp": 0.18530273, + "step": 2219, + "time_per_iteration": 2.7391462326049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151304, + "balance_loss_mlp": 1.13211131, + "epoch": 0.42708734128510967, + "flos": 537919460352.0, + "grad_norm": 0.08558880584649159, + "language_loss": 0.87292302, + "learning_rate": 0.0006404586058168951, + "loss": 0.88443601, + "num_input_tokens_seen": 184622544, + "router_z_loss_mlp": 0.19189453, + "step": 2220, + "time_per_iteration": 2.7562613487243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142405, + "balance_loss_mlp": 1.12310505, + "epoch": 0.4272797229703732, + "flos": 502865998848.0, + "grad_norm": 0.08712204240656665, + "language_loss": 0.86527437, + "learning_rate": 0.0006401595814452296, + "loss": 0.87669843, + "num_input_tokens_seen": 184692544, + "router_z_loss_mlp": 0.19287109, + "step": 2221, + "time_per_iteration": 2.6396138668060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120795, + "balance_loss_mlp": 1.10141122, + "epoch": 0.4274721046556368, 
+ "flos": 492453955584.0, + "grad_norm": 0.07683160316407273, + "language_loss": 0.80591571, + "learning_rate": 0.000639860502658789, + "loss": 0.81712359, + "num_input_tokens_seen": 184760480, + "router_z_loss_mlp": 0.19360352, + "step": 2222, + "time_per_iteration": 2.655627489089966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115469, + "balance_loss_mlp": 1.09618044, + "epoch": 0.4276644863409004, + "flos": 568367456256.0, + "grad_norm": 0.0619683298423062, + "language_loss": 0.85100698, + "learning_rate": 0.0006395613695736853, + "loss": 0.86216164, + "num_input_tokens_seen": 184834080, + "router_z_loss_mlp": 0.19287109, + "step": 2223, + "time_per_iteration": 2.701129674911499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103811, + "balance_loss_mlp": 1.08472598, + "epoch": 0.4278568680261639, + "flos": 607436112384.0, + "grad_norm": 0.07797079059499014, + "language_loss": 0.81455553, + "learning_rate": 0.0006392621823060529, + "loss": 0.82559359, + "num_input_tokens_seen": 184905872, + "router_z_loss_mlp": 0.19067383, + "step": 2224, + "time_per_iteration": 2.7364578247070312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099286, + "balance_loss_mlp": 1.08043897, + "epoch": 0.4280492497114275, + "flos": 560527589376.0, + "grad_norm": 0.08496205952123127, + "language_loss": 0.84790826, + "learning_rate": 0.0006389629409720465, + "loss": 0.85890114, + "num_input_tokens_seen": 184972320, + "router_z_loss_mlp": 0.18835449, + "step": 2225, + "time_per_iteration": 2.673173427581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109636, + "balance_loss_mlp": 1.07715571, + "epoch": 0.428241631396691, + "flos": 720646709760.0, + "grad_norm": 0.0715414323965843, + "language_loss": 0.88466454, + "learning_rate": 0.0006386636456878417, + "loss": 0.89562809, + "num_input_tokens_seen": 185051040, + "router_z_loss_mlp": 0.19177246, + "step": 2226, + "time_per_iteration": 2.9119651317596436 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098271, + "balance_loss_mlp": 1.07898331, + "epoch": 0.4284340130819546, + "flos": 429467774976.0, + "grad_norm": 0.09078876082736503, + "language_loss": 0.91914666, + "learning_rate": 0.0006383642965696353, + "loss": 0.93012941, + "num_input_tokens_seen": 185113552, + "router_z_loss_mlp": 0.19262695, + "step": 2227, + "time_per_iteration": 2.546172618865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105185, + "balance_loss_mlp": 1.08565903, + "epoch": 0.42862639476721814, + "flos": 525016733184.0, + "grad_norm": 0.10289049243839221, + "language_loss": 0.83054781, + "learning_rate": 0.000638064893733645, + "loss": 0.84159964, + "num_input_tokens_seen": 185185056, + "router_z_loss_mlp": 0.19506836, + "step": 2228, + "time_per_iteration": 2.752192735671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110948, + "balance_loss_mlp": 1.09085989, + "epoch": 0.42881877645248173, + "flos": 465346446336.0, + "grad_norm": 0.15473525900744378, + "language_loss": 0.89614534, + "learning_rate": 0.000637765437296109, + "loss": 0.90724015, + "num_input_tokens_seen": 185257248, + "router_z_loss_mlp": 0.18615723, + "step": 2229, + "time_per_iteration": 2.6742892265319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106608, + "balance_loss_mlp": 1.08742726, + "epoch": 0.42901115813774526, + "flos": 560297793024.0, + "grad_norm": 0.06911950421263405, + "language_loss": 0.8512131, + "learning_rate": 0.000637465927373287, + "loss": 0.86227918, + "num_input_tokens_seen": 185324800, + "router_z_loss_mlp": 0.19165039, + "step": 2230, + "time_per_iteration": 2.6567254066467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103693, + "balance_loss_mlp": 1.08500099, + "epoch": 0.42920353982300885, + "flos": 561454115328.0, + "grad_norm": 0.08280955993669904, + "language_loss": 0.78714275, + "learning_rate": 0.000637166364081459, + "loss": 
0.79817969, + "num_input_tokens_seen": 185393408, + "router_z_loss_mlp": 0.18688965, + "step": 2231, + "time_per_iteration": 2.671881914138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118758, + "balance_loss_mlp": 1.10104382, + "epoch": 0.42939592150827244, + "flos": 556248093696.0, + "grad_norm": 0.10217834412041502, + "language_loss": 0.84177876, + "learning_rate": 0.0006368667475369256, + "loss": 0.85296631, + "num_input_tokens_seen": 185467968, + "router_z_loss_mlp": 0.17736816, + "step": 2232, + "time_per_iteration": 2.760406732559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_mlp": 1.03175175, + "epoch": 0.42958830319353597, + "flos": 1521623688192.0, + "grad_norm": 0.029167273687310865, + "language_loss": 0.78527778, + "learning_rate": 0.0006365670778560084, + "loss": 0.79569829, + "num_input_tokens_seen": 185705232, + "router_z_loss_mlp": 0.10302734, + "step": 2233, + "time_per_iteration": 4.915542840957642 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_mlp": 1.02887213, + "epoch": 0.42978068487879956, + "flos": 1495813837824.0, + "grad_norm": 0.028672121204767892, + "language_loss": 0.78895426, + "learning_rate": 0.0006362673551550494, + "loss": 0.79934502, + "num_input_tokens_seen": 185932672, + "router_z_loss_mlp": 0.10205078, + "step": 2234, + "time_per_iteration": 4.8368518352508545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158883, + "balance_loss_mlp": 1.14040589, + "epoch": 0.4299730665640631, + "flos": 546992372736.0, + "grad_norm": 0.1071521836349002, + "language_loss": 0.85815042, + "learning_rate": 0.0006359675795504112, + "loss": 0.86973917, + "num_input_tokens_seen": 186006288, + "router_z_loss_mlp": 0.18481445, + "step": 2235, + "time_per_iteration": 2.689207077026367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157771, + "balance_loss_mlp": 1.13929391, + "epoch": 
0.4301654482493267, + "flos": 1129293342720.0, + "grad_norm": 0.08968188926211089, + "language_loss": 0.74473494, + "learning_rate": 0.0006356677511584775, + "loss": 0.75631261, + "num_input_tokens_seen": 186097168, + "router_z_loss_mlp": 0.18481445, + "step": 2236, + "time_per_iteration": 3.4835057258605957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140943, + "balance_loss_mlp": 1.12231028, + "epoch": 0.4303578299345902, + "flos": 495750025728.0, + "grad_norm": 0.07661214353194774, + "language_loss": 0.86188674, + "learning_rate": 0.0006353678700956511, + "loss": 0.8732962, + "num_input_tokens_seen": 186163904, + "router_z_loss_mlp": 0.18615723, + "step": 2237, + "time_per_iteration": 2.5932724475860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122051, + "balance_loss_mlp": 1.10363352, + "epoch": 0.4305502116198538, + "flos": 615762736128.0, + "grad_norm": 0.10135375141644645, + "language_loss": 0.83612645, + "learning_rate": 0.0006350679364783569, + "loss": 0.84734702, + "num_input_tokens_seen": 186233888, + "router_z_loss_mlp": 0.1842041, + "step": 2238, + "time_per_iteration": 2.799832582473755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116492, + "balance_loss_mlp": 1.09846783, + "epoch": 0.4307425933051173, + "flos": 559260039168.0, + "grad_norm": 0.08578747749075483, + "language_loss": 0.85542685, + "learning_rate": 0.0006347679504230393, + "loss": 0.86659181, + "num_input_tokens_seen": 186301168, + "router_z_loss_mlp": 0.18041992, + "step": 2239, + "time_per_iteration": 2.692394971847534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101167, + "balance_loss_mlp": 1.08270121, + "epoch": 0.4309349749903809, + "flos": 972166344192.0, + "grad_norm": 0.07961944034188723, + "language_loss": 0.76030314, + "learning_rate": 0.0006344679120461632, + "loss": 0.77131486, + "num_input_tokens_seen": 186392096, + "router_z_loss_mlp": 0.18444824, + "step": 2240, + 
"time_per_iteration": 3.3374927043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095492, + "balance_loss_mlp": 1.07701421, + "epoch": 0.4311273566756445, + "flos": 541924743168.0, + "grad_norm": 0.0793940534533153, + "language_loss": 0.7985338, + "learning_rate": 0.0006341678214642134, + "loss": 0.80948877, + "num_input_tokens_seen": 186458000, + "router_z_loss_mlp": 0.18469238, + "step": 2241, + "time_per_iteration": 2.6277148723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106602, + "balance_loss_mlp": 1.08830297, + "epoch": 0.43131973836090803, + "flos": 761674503168.0, + "grad_norm": 0.08042276557968771, + "language_loss": 0.82835627, + "learning_rate": 0.0006338676787936963, + "loss": 0.83942229, + "num_input_tokens_seen": 186544992, + "router_z_loss_mlp": 0.18286133, + "step": 2242, + "time_per_iteration": 3.1297900676727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108169, + "balance_loss_mlp": 1.08982253, + "epoch": 0.4315121200461716, + "flos": 554530862592.0, + "grad_norm": 0.09204417916973401, + "language_loss": 0.8383373, + "learning_rate": 0.0006335674841511367, + "loss": 0.84941894, + "num_input_tokens_seen": 186614960, + "router_z_loss_mlp": 0.18347168, + "step": 2243, + "time_per_iteration": 2.667814254760742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093207, + "balance_loss_mlp": 1.08414674, + "epoch": 0.43170450173143515, + "flos": 1485334609920.0, + "grad_norm": 0.03538748768114217, + "language_loss": 0.7918117, + "learning_rate": 0.000633267237653081, + "loss": 0.80274379, + "num_input_tokens_seen": 186854288, + "router_z_loss_mlp": 0.09082031, + "step": 2244, + "time_per_iteration": 4.997291803359985 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085049, + "balance_loss_mlp": 1.07603705, + "epoch": 0.43189688341669874, + "flos": 1473697234944.0, + "grad_norm": 0.03507908076143408, + "language_loss": 0.77365553, + 
"learning_rate": 0.0006329669394160953, + "loss": 0.78450596, + "num_input_tokens_seen": 187090272, + "router_z_loss_mlp": 0.09033203, + "step": 2245, + "time_per_iteration": 4.884565591812134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114789, + "balance_loss_mlp": 1.09558439, + "epoch": 0.43208926510196227, + "flos": 492938141184.0, + "grad_norm": 0.08187280769981854, + "language_loss": 0.82496786, + "learning_rate": 0.0006326665895567652, + "loss": 0.83611572, + "num_input_tokens_seen": 187157584, + "router_z_loss_mlp": 0.1920166, + "step": 2246, + "time_per_iteration": 2.6677396297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123468, + "balance_loss_mlp": 1.10469246, + "epoch": 0.43228164678722586, + "flos": 520235799552.0, + "grad_norm": 0.08598825839477024, + "language_loss": 0.86984897, + "learning_rate": 0.0006323661881916976, + "loss": 0.88108367, + "num_input_tokens_seen": 187229408, + "router_z_loss_mlp": 0.1875, + "step": 2247, + "time_per_iteration": 2.7388386726379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117292, + "balance_loss_mlp": 1.09908867, + "epoch": 0.4324740284724894, + "flos": 796056201216.0, + "grad_norm": 0.06738996012815959, + "language_loss": 0.80918467, + "learning_rate": 0.0006320657354375179, + "loss": 0.82035756, + "num_input_tokens_seen": 187304384, + "router_z_loss_mlp": 0.18212891, + "step": 2248, + "time_per_iteration": 3.047557830810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130282, + "balance_loss_mlp": 1.11192417, + "epoch": 0.432666410157753, + "flos": 482153140224.0, + "grad_norm": 0.08033421843515161, + "language_loss": 0.86710787, + "learning_rate": 0.0006317652314108726, + "loss": 0.8784107, + "num_input_tokens_seen": 187368064, + "router_z_loss_mlp": 0.18347168, + "step": 2249, + "time_per_iteration": 2.547611713409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121909, + "balance_loss_mlp": 
1.10406351, + "epoch": 0.43285879184301657, + "flos": 500212329984.0, + "grad_norm": 0.07824522100123071, + "language_loss": 0.91323555, + "learning_rate": 0.0006314646762284277, + "loss": 0.92445469, + "num_input_tokens_seen": 187436320, + "router_z_loss_mlp": 0.17858887, + "step": 2250, + "time_per_iteration": 2.648721933364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024653, + "balance_loss_mlp": 1.01502049, + "epoch": 0.4330511735282801, + "flos": 1510448103936.0, + "grad_norm": 0.012196079218770799, + "language_loss": 0.75425828, + "learning_rate": 0.0006311640700068691, + "loss": 0.76450479, + "num_input_tokens_seen": 187670912, + "router_z_loss_mlp": 0.09619141, + "step": 2251, + "time_per_iteration": 4.9720799922943115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134181, + "balance_loss_mlp": 1.11596584, + "epoch": 0.4332435552135437, + "flos": 699582915072.0, + "grad_norm": 0.07706489930265227, + "language_loss": 0.77657586, + "learning_rate": 0.0006308634128629022, + "loss": 0.78791773, + "num_input_tokens_seen": 187746432, + "router_z_loss_mlp": 0.18225098, + "step": 2252, + "time_per_iteration": 2.898723602294922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131863, + "balance_loss_mlp": 1.11357653, + "epoch": 0.4334359368988072, + "flos": 592292321280.0, + "grad_norm": 0.09977200174188003, + "language_loss": 0.87270236, + "learning_rate": 0.0006305627049132531, + "loss": 0.88402092, + "num_input_tokens_seen": 187820032, + "router_z_loss_mlp": 0.18286133, + "step": 2253, + "time_per_iteration": 2.854081153869629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120951, + "balance_loss_mlp": 1.1019249, + "epoch": 0.4336283185840708, + "flos": 842806508544.0, + "grad_norm": 0.08155008814068082, + "language_loss": 0.8592571, + "learning_rate": 0.0006302619462746662, + "loss": 0.87046659, + "num_input_tokens_seen": 187904400, + "router_z_loss_mlp": 0.19018555, + "step": 
2254, + "time_per_iteration": 3.164759397506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111221, + "balance_loss_mlp": 1.09445965, + "epoch": 0.43382070026933434, + "flos": 626258843136.0, + "grad_norm": 0.0732322900076577, + "language_loss": 0.90031815, + "learning_rate": 0.0006299611370639069, + "loss": 0.91144025, + "num_input_tokens_seen": 187973264, + "router_z_loss_mlp": 0.17773438, + "step": 2255, + "time_per_iteration": 2.753937005996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111399, + "balance_loss_mlp": 1.09258795, + "epoch": 0.4340130819545979, + "flos": 591111406080.0, + "grad_norm": 0.07459277492074774, + "language_loss": 0.79176068, + "learning_rate": 0.0006296602773977593, + "loss": 0.80287468, + "num_input_tokens_seen": 188039984, + "router_z_loss_mlp": 0.18798828, + "step": 2256, + "time_per_iteration": 2.720043659210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111381, + "balance_loss_mlp": 1.09282053, + "epoch": 0.4342054636398615, + "flos": 490889797632.0, + "grad_norm": 0.06314614385855079, + "language_loss": 0.873402, + "learning_rate": 0.0006293593673930277, + "loss": 0.88451576, + "num_input_tokens_seen": 188113456, + "router_z_loss_mlp": 0.18566895, + "step": 2257, + "time_per_iteration": 2.7014408111572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102122, + "balance_loss_mlp": 1.0837394, + "epoch": 0.43439784532512504, + "flos": 698994842112.0, + "grad_norm": 0.07573255135522808, + "language_loss": 0.78537059, + "learning_rate": 0.0006290584071665358, + "loss": 0.79639179, + "num_input_tokens_seen": 188192480, + "router_z_loss_mlp": 0.18371582, + "step": 2258, + "time_per_iteration": 2.9237425327301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109661, + "balance_loss_mlp": 1.09070623, + "epoch": 0.43459022701038863, + "flos": 485824739328.0, + "grad_norm": 0.09488327166679841, + "language_loss": 0.82044512, + 
"learning_rate": 0.0006287573968351266, + "loss": 0.83154172, + "num_input_tokens_seen": 188258784, + "router_z_loss_mlp": 0.18945312, + "step": 2259, + "time_per_iteration": 2.574779748916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100612, + "balance_loss_mlp": 1.08195579, + "epoch": 0.43478260869565216, + "flos": 643107382272.0, + "grad_norm": 0.08898100409874855, + "language_loss": 0.82007015, + "learning_rate": 0.0006284563365156626, + "loss": 0.83107626, + "num_input_tokens_seen": 188331312, + "router_z_loss_mlp": 0.18652344, + "step": 2260, + "time_per_iteration": 2.8346612453460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107563, + "balance_loss_mlp": 1.08845389, + "epoch": 0.43497499038091575, + "flos": 426097552896.0, + "grad_norm": 0.09100088182337301, + "language_loss": 0.87183499, + "learning_rate": 0.0006281552263250261, + "loss": 0.88291061, + "num_input_tokens_seen": 188393712, + "router_z_loss_mlp": 0.19116211, + "step": 2261, + "time_per_iteration": 2.549306631088257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054669, + "balance_loss_mlp": 1.04460812, + "epoch": 0.4351673720661793, + "flos": 1538378625024.0, + "grad_norm": 0.02508916228462863, + "language_loss": 0.80691534, + "learning_rate": 0.000627854066380118, + "loss": 0.81746203, + "num_input_tokens_seen": 188621152, + "router_z_loss_mlp": 0.10058594, + "step": 2262, + "time_per_iteration": 4.837932348251343 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104657, + "balance_loss_mlp": 1.08554804, + "epoch": 0.43535975375144287, + "flos": 749155018752.0, + "grad_norm": 0.08522062407758652, + "language_loss": 0.81544203, + "learning_rate": 0.0006275528567978593, + "loss": 0.82648861, + "num_input_tokens_seen": 188697120, + "router_z_loss_mlp": 0.19091797, + "step": 2263, + "time_per_iteration": 2.936924934387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112769, + 
"balance_loss_mlp": 1.09411263, + "epoch": 0.4355521354367064, + "flos": 861280874496.0, + "grad_norm": 0.07411268466258768, + "language_loss": 0.826931, + "learning_rate": 0.0006272515976951898, + "loss": 0.83805871, + "num_input_tokens_seen": 188778480, + "router_z_loss_mlp": 0.18640137, + "step": 2264, + "time_per_iteration": 3.0930423736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107636, + "balance_loss_mlp": 1.08872962, + "epoch": 0.43574451712197, + "flos": 734527719936.0, + "grad_norm": 0.09109036690828846, + "language_loss": 0.79239774, + "learning_rate": 0.0006269502891890687, + "loss": 0.80347407, + "num_input_tokens_seen": 188863616, + "router_z_loss_mlp": 0.18896484, + "step": 2265, + "time_per_iteration": 3.0183792114257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107502, + "balance_loss_mlp": 1.08883369, + "epoch": 0.4359368988072336, + "flos": 570578784768.0, + "grad_norm": 0.05550243860706018, + "language_loss": 0.87779111, + "learning_rate": 0.0006266489313964743, + "loss": 0.88886613, + "num_input_tokens_seen": 188933984, + "router_z_loss_mlp": 0.18652344, + "step": 2266, + "time_per_iteration": 2.7831835746765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121045, + "balance_loss_mlp": 1.10263872, + "epoch": 0.4361292804924971, + "flos": 555528969216.0, + "grad_norm": 0.0703513545387446, + "language_loss": 0.85298383, + "learning_rate": 0.0006263475244344041, + "loss": 0.86419421, + "num_input_tokens_seen": 189012976, + "router_z_loss_mlp": 0.18395996, + "step": 2267, + "time_per_iteration": 2.857132911682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118688, + "balance_loss_mlp": 1.10052013, + "epoch": 0.4363216621777607, + "flos": 557285847552.0, + "grad_norm": 0.08642791248778911, + "language_loss": 0.84379327, + "learning_rate": 0.0006260460684198746, + "loss": 0.85498011, + "num_input_tokens_seen": 189079664, + "router_z_loss_mlp": 
0.1817627, + "step": 2268, + "time_per_iteration": 2.692237138748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107605, + "balance_loss_mlp": 1.08955705, + "epoch": 0.4365140438630242, + "flos": 478222009344.0, + "grad_norm": 0.0923795472926113, + "language_loss": 0.84379983, + "learning_rate": 0.0006257445634699213, + "loss": 0.85487592, + "num_input_tokens_seen": 189144688, + "router_z_loss_mlp": 0.18066406, + "step": 2269, + "time_per_iteration": 2.5514066219329834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113893, + "balance_loss_mlp": 1.0958451, + "epoch": 0.4367064255482878, + "flos": 578917891584.0, + "grad_norm": 0.07185982898842977, + "language_loss": 0.82919574, + "learning_rate": 0.0006254430097015993, + "loss": 0.84033465, + "num_input_tokens_seen": 189213984, + "router_z_loss_mlp": 0.18054199, + "step": 2270, + "time_per_iteration": 2.70414662361145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039786, + "balance_loss_mlp": 1.02981973, + "epoch": 0.43689880723355135, + "flos": 1458946225152.0, + "grad_norm": 0.018847560898896817, + "language_loss": 0.76479089, + "learning_rate": 0.0006251414072319815, + "loss": 0.7751888, + "num_input_tokens_seen": 189434416, + "router_z_loss_mlp": 0.09960938, + "step": 2271, + "time_per_iteration": 4.881477355957031 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109453, + "balance_loss_mlp": 1.09232235, + "epoch": 0.43709118891881493, + "flos": 667610408448.0, + "grad_norm": 0.06834440940873689, + "language_loss": 0.85169542, + "learning_rate": 0.0006248397561781609, + "loss": 0.86278993, + "num_input_tokens_seen": 189513248, + "router_z_loss_mlp": 0.17138672, + "step": 2272, + "time_per_iteration": 2.9807589054107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114956, + "balance_loss_mlp": 1.09752727, + "epoch": 0.43728357060407846, + "flos": 544872448512.0, + "grad_norm": 0.08779020279595867, + 
"language_loss": 0.85627788, + "learning_rate": 0.0006245380566572482, + "loss": 0.86742747, + "num_input_tokens_seen": 189585392, + "router_z_loss_mlp": 0.17456055, + "step": 2273, + "time_per_iteration": 2.6780998706817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113394, + "balance_loss_mlp": 1.09640646, + "epoch": 0.43747595228934205, + "flos": 746839802880.0, + "grad_norm": 0.07304773845504615, + "language_loss": 0.75851929, + "learning_rate": 0.0006242363087863744, + "loss": 0.7696532, + "num_input_tokens_seen": 189667552, + "router_z_loss_mlp": 0.17004395, + "step": 2274, + "time_per_iteration": 2.9744510650634766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116168, + "balance_loss_mlp": 1.0989182, + "epoch": 0.43766833397460564, + "flos": 631353636864.0, + "grad_norm": 0.1377417309618575, + "language_loss": 0.86166036, + "learning_rate": 0.0006239345126826878, + "loss": 0.87282199, + "num_input_tokens_seen": 189742048, + "router_z_loss_mlp": 0.17272949, + "step": 2275, + "time_per_iteration": 2.7981135845184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108683, + "balance_loss_mlp": 1.09152877, + "epoch": 0.43786071565986917, + "flos": 530986295808.0, + "grad_norm": 0.07859590561046474, + "language_loss": 0.83992988, + "learning_rate": 0.0006236326684633561, + "loss": 0.8510167, + "num_input_tokens_seen": 189817968, + "router_z_loss_mlp": 0.17175293, + "step": 2276, + "time_per_iteration": 2.818861722946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112467, + "balance_loss_mlp": 1.09526503, + "epoch": 0.43805309734513276, + "flos": 538547180544.0, + "grad_norm": 0.07703424900820159, + "language_loss": 0.74875319, + "learning_rate": 0.0006233307762455658, + "loss": 0.75987786, + "num_input_tokens_seen": 189882608, + "router_z_loss_mlp": 0.17224121, + "step": 2277, + "time_per_iteration": 2.6329345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01113593, + "balance_loss_mlp": 1.09641492, + "epoch": 0.4382454790303963, + "flos": 864542439936.0, + "grad_norm": 0.08103172587748399, + "language_loss": 0.83020627, + "learning_rate": 0.0006230288361465216, + "loss": 0.84134221, + "num_input_tokens_seen": 189960608, + "router_z_loss_mlp": 0.17199707, + "step": 2278, + "time_per_iteration": 3.093740701675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121877, + "balance_loss_mlp": 1.10465097, + "epoch": 0.4384378607156599, + "flos": 765499548672.0, + "grad_norm": 0.0865781646571655, + "language_loss": 0.8464967, + "learning_rate": 0.0006227268482834473, + "loss": 0.85771543, + "num_input_tokens_seen": 190035472, + "router_z_loss_mlp": 0.17248535, + "step": 2279, + "time_per_iteration": 2.9201176166534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125481, + "balance_loss_mlp": 1.10830259, + "epoch": 0.4386302424009234, + "flos": 668566669824.0, + "grad_norm": 0.07906200997295257, + "language_loss": 0.86881065, + "learning_rate": 0.000622424812773585, + "loss": 0.88006544, + "num_input_tokens_seen": 190109312, + "router_z_loss_mlp": 0.17199707, + "step": 2280, + "time_per_iteration": 2.8375024795532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133302, + "balance_loss_mlp": 1.11602879, + "epoch": 0.438822624086187, + "flos": 485182338048.0, + "grad_norm": 0.07902412331438459, + "language_loss": 0.79696977, + "learning_rate": 0.000622122729734195, + "loss": 0.80830276, + "num_input_tokens_seen": 190174176, + "router_z_loss_mlp": 0.17285156, + "step": 2281, + "time_per_iteration": 2.587625741958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127945, + "balance_loss_mlp": 1.11082637, + "epoch": 0.4390150057714506, + "flos": 499218992640.0, + "grad_norm": 0.06489318495758713, + "language_loss": 0.87247634, + "learning_rate": 0.0006218205992825566, + "loss": 0.8837558, + "num_input_tokens_seen": 190243888, + 
"router_z_loss_mlp": 0.17138672, + "step": 2282, + "time_per_iteration": 2.6426842212677 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132622, + "balance_loss_mlp": 1.11561131, + "epoch": 0.4392073874567141, + "flos": 558219714048.0, + "grad_norm": 0.07249325505401696, + "language_loss": 0.81692946, + "learning_rate": 0.0006215184215359671, + "loss": 0.82825571, + "num_input_tokens_seen": 190317504, + "router_z_loss_mlp": 0.17016602, + "step": 2283, + "time_per_iteration": 2.7548625469207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131603, + "balance_loss_mlp": 1.11440063, + "epoch": 0.4393997691419777, + "flos": 605306276352.0, + "grad_norm": 0.07525739768421633, + "language_loss": 0.86762762, + "learning_rate": 0.0006212161966117425, + "loss": 0.87894368, + "num_input_tokens_seen": 190390160, + "router_z_loss_mlp": 0.17224121, + "step": 2284, + "time_per_iteration": 2.738553762435913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131945, + "balance_loss_mlp": 1.11446857, + "epoch": 0.43959215082724123, + "flos": 804145688064.0, + "grad_norm": 0.077553661572433, + "language_loss": 0.81615996, + "learning_rate": 0.0006209139246272164, + "loss": 0.82747942, + "num_input_tokens_seen": 190467600, + "router_z_loss_mlp": 0.17492676, + "step": 2285, + "time_per_iteration": 3.024388074874878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133353, + "balance_loss_mlp": 1.11548376, + "epoch": 0.4397845325125048, + "flos": 487643286528.0, + "grad_norm": 0.07341525875363067, + "language_loss": 0.81566632, + "learning_rate": 0.0006206116056997421, + "loss": 0.8269999, + "num_input_tokens_seen": 190534192, + "router_z_loss_mlp": 0.17871094, + "step": 2286, + "time_per_iteration": 2.5751805305480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130534, + "balance_loss_mlp": 1.11304617, + "epoch": 0.43997691419776835, + "flos": 480811438080.0, + "grad_norm": 
0.0674295682524957, + "language_loss": 0.82623774, + "learning_rate": 0.0006203092399466892, + "loss": 0.83754307, + "num_input_tokens_seen": 190601440, + "router_z_loss_mlp": 0.17504883, + "step": 2287, + "time_per_iteration": 2.557861566543579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142808, + "balance_loss_mlp": 1.12514091, + "epoch": 0.44016929588303194, + "flos": 483124082688.0, + "grad_norm": 0.055585597684010626, + "language_loss": 0.84940028, + "learning_rate": 0.0006200068274854473, + "loss": 0.8608284, + "num_input_tokens_seen": 190672528, + "router_z_loss_mlp": 0.17700195, + "step": 2288, + "time_per_iteration": 2.6604013442993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139053, + "balance_loss_mlp": 1.12110031, + "epoch": 0.4403616775682955, + "flos": 571853675520.0, + "grad_norm": 0.05756252195592342, + "language_loss": 0.85686207, + "learning_rate": 0.0006197043684334229, + "loss": 0.86825264, + "num_input_tokens_seen": 190750704, + "router_z_loss_mlp": 0.17956543, + "step": 2289, + "time_per_iteration": 2.7742552757263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136744, + "balance_loss_mlp": 1.11905324, + "epoch": 0.44055405925355906, + "flos": 630849627648.0, + "grad_norm": 0.09031384979596896, + "language_loss": 0.78885317, + "learning_rate": 0.0006194018629080411, + "loss": 0.80022061, + "num_input_tokens_seen": 190821664, + "router_z_loss_mlp": 0.17712402, + "step": 2290, + "time_per_iteration": 2.755141019821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143468, + "balance_loss_mlp": 1.12530041, + "epoch": 0.44074644093882265, + "flos": 536782961664.0, + "grad_norm": 0.10381992178140695, + "language_loss": 0.81444335, + "learning_rate": 0.0006190993110267451, + "loss": 0.82587808, + "num_input_tokens_seen": 190893888, + "router_z_loss_mlp": 0.18164062, + "step": 2291, + "time_per_iteration": 2.717245578765869 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01138273, + "balance_loss_mlp": 1.1200701, + "epoch": 0.4409388226240862, + "flos": 463229093376.0, + "grad_norm": 0.06842071551306793, + "language_loss": 0.84298384, + "learning_rate": 0.0006187967129069958, + "loss": 0.8543666, + "num_input_tokens_seen": 190956800, + "router_z_loss_mlp": 0.18212891, + "step": 2292, + "time_per_iteration": 2.540931463241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139526, + "balance_loss_mlp": 1.12121558, + "epoch": 0.44113120430934977, + "flos": 566005252608.0, + "grad_norm": 0.07329037094919502, + "language_loss": 0.86953282, + "learning_rate": 0.0006184940686662722, + "loss": 0.88092804, + "num_input_tokens_seen": 191032048, + "router_z_loss_mlp": 0.18322754, + "step": 2293, + "time_per_iteration": 2.7757341861724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140768, + "balance_loss_mlp": 1.1223979, + "epoch": 0.4413235859946133, + "flos": 543585074688.0, + "grad_norm": 0.08855099948535183, + "language_loss": 0.89983863, + "learning_rate": 0.0006181913784220714, + "loss": 0.9112463, + "num_input_tokens_seen": 191099952, + "router_z_loss_mlp": 0.18371582, + "step": 2294, + "time_per_iteration": 2.723515510559082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040412, + "balance_loss_mlp": 1.03092277, + "epoch": 0.4415159676798769, + "flos": 1569871342080.0, + "grad_norm": 0.030293744399198016, + "language_loss": 0.80553782, + "learning_rate": 0.0006178886422919078, + "loss": 0.81594193, + "num_input_tokens_seen": 191335968, + "router_z_loss_mlp": 0.09472656, + "step": 2295, + "time_per_iteration": 4.940914630889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125035, + "balance_loss_mlp": 1.10708177, + "epoch": 0.4417083493651404, + "flos": 658740128256.0, + "grad_norm": 0.07282895932349266, + "language_loss": 0.79783386, + "learning_rate": 0.0006175858603933146, + "loss": 0.80908418, + 
"num_input_tokens_seen": 191410112, + "router_z_loss_mlp": 0.1796875, + "step": 2296, + "time_per_iteration": 2.9011893272399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117973, + "balance_loss_mlp": 1.09999609, + "epoch": 0.441900731050404, + "flos": 740457635328.0, + "grad_norm": 0.07093452663269637, + "language_loss": 0.80995864, + "learning_rate": 0.0006172830328438416, + "loss": 0.82113832, + "num_input_tokens_seen": 191491552, + "router_z_loss_mlp": 0.17993164, + "step": 2297, + "time_per_iteration": 2.984313726425171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115845, + "balance_loss_mlp": 1.09765363, + "epoch": 0.44209311273566754, + "flos": 539441399808.0, + "grad_norm": 0.06543332431983825, + "language_loss": 0.87005913, + "learning_rate": 0.0006169801597610572, + "loss": 0.8812176, + "num_input_tokens_seen": 191567872, + "router_z_loss_mlp": 0.18212891, + "step": 2298, + "time_per_iteration": 2.7446672916412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105385, + "balance_loss_mlp": 1.08803988, + "epoch": 0.4422854944209311, + "flos": 621613730304.0, + "grad_norm": 0.09691889340683667, + "language_loss": 0.89723885, + "learning_rate": 0.0006166772412625469, + "loss": 0.90829265, + "num_input_tokens_seen": 191638032, + "router_z_loss_mlp": 0.17358398, + "step": 2299, + "time_per_iteration": 2.8357315063476562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107801, + "balance_loss_mlp": 1.08962202, + "epoch": 0.4424778761061947, + "flos": 658824192000.0, + "grad_norm": 0.10216386732709903, + "language_loss": 0.81670028, + "learning_rate": 0.0006163742774659141, + "loss": 0.82777828, + "num_input_tokens_seen": 191709104, + "router_z_loss_mlp": 0.1817627, + "step": 2300, + "time_per_iteration": 2.886781692504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095311, + "balance_loss_mlp": 1.07751346, + "epoch": 0.44267025779145824, + "flos": 
568577428992.0, + "grad_norm": 0.07973359147829089, + "language_loss": 0.85959738, + "learning_rate": 0.0006160712684887801, + "loss": 0.87055051, + "num_input_tokens_seen": 191787072, + "router_z_loss_mlp": 0.17822266, + "step": 2301, + "time_per_iteration": 2.7916574478149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109187, + "balance_loss_mlp": 1.07431102, + "epoch": 0.44286263947672183, + "flos": 496738220544.0, + "grad_norm": 0.06808021774790461, + "language_loss": 0.82115805, + "learning_rate": 0.0006157682144487832, + "loss": 0.83207679, + "num_input_tokens_seen": 191863040, + "router_z_loss_mlp": 0.17565918, + "step": 2302, + "time_per_iteration": 2.795738458633423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094413, + "balance_loss_mlp": 1.07613826, + "epoch": 0.44305502116198536, + "flos": 609397820928.0, + "grad_norm": 0.0749153625811459, + "language_loss": 0.83107322, + "learning_rate": 0.0006154651154635793, + "loss": 0.84201735, + "num_input_tokens_seen": 191940352, + "router_z_loss_mlp": 0.18273926, + "step": 2303, + "time_per_iteration": 4.31014609336853 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090587, + "balance_loss_mlp": 1.07243156, + "epoch": 0.44324740284724895, + "flos": 470794747392.0, + "grad_norm": 0.07642073153592485, + "language_loss": 0.84451294, + "learning_rate": 0.0006151619716508421, + "loss": 0.8554188, + "num_input_tokens_seen": 192006896, + "router_z_loss_mlp": 0.18164062, + "step": 2304, + "time_per_iteration": 2.6006975173950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090647, + "balance_loss_mlp": 1.07205081, + "epoch": 0.4434397845325125, + "flos": 578725171200.0, + "grad_norm": 0.07612741560937177, + "language_loss": 0.87099224, + "learning_rate": 0.0006148587831282625, + "loss": 0.8818987, + "num_input_tokens_seen": 192075312, + "router_z_loss_mlp": 0.18591309, + "step": 2305, + "time_per_iteration": 2.7009835243225098 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048095, + "balance_loss_mlp": 1.03808129, + "epoch": 0.44363216621777607, + "flos": 1496608939008.0, + "grad_norm": 0.019656861653556033, + "language_loss": 0.79176068, + "learning_rate": 0.0006145555500135483, + "loss": 0.80224162, + "num_input_tokens_seen": 192304816, + "router_z_loss_mlp": 0.10009766, + "step": 2306, + "time_per_iteration": 4.9429686069488525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108493, + "balance_loss_mlp": 1.06604683, + "epoch": 0.44382454790303966, + "flos": 477322647552.0, + "grad_norm": 0.07723488854599227, + "language_loss": 0.87132251, + "learning_rate": 0.0006142522724244255, + "loss": 0.88217181, + "num_input_tokens_seen": 192369232, + "router_z_loss_mlp": 0.18884277, + "step": 2307, + "time_per_iteration": 2.553419828414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_mlp": 1.02589071, + "epoch": 0.4440169295883032, + "flos": 1544115820032.0, + "grad_norm": 0.014915460519873193, + "language_loss": 0.76484716, + "learning_rate": 0.0006139489504786368, + "loss": 0.77520525, + "num_input_tokens_seen": 192600176, + "router_z_loss_mlp": 0.09912109, + "step": 2308, + "time_per_iteration": 4.877639055252075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085887, + "balance_loss_mlp": 1.06711113, + "epoch": 0.4442093112735668, + "flos": 591089011200.0, + "grad_norm": 0.07688151387557987, + "language_loss": 0.77357888, + "learning_rate": 0.000613645584293942, + "loss": 0.78443772, + "num_input_tokens_seen": 192675424, + "router_z_loss_mlp": 0.18762207, + "step": 2309, + "time_per_iteration": 2.9022634029388428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088358, + "balance_loss_mlp": 1.06968963, + "epoch": 0.4444016929588303, + "flos": 530272313856.0, + "grad_norm": 0.08682478727714991, + "language_loss": 0.83149701, + "learning_rate": 0.0006133421739881185, + "loss": 
0.84238064, + "num_input_tokens_seen": 192747552, + "router_z_loss_mlp": 0.18664551, + "step": 2310, + "time_per_iteration": 2.6619491577148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090945, + "balance_loss_mlp": 1.07256329, + "epoch": 0.4445940746440939, + "flos": 620234952192.0, + "grad_norm": 0.08001840232131298, + "language_loss": 0.82499826, + "learning_rate": 0.0006130387196789605, + "loss": 0.8359077, + "num_input_tokens_seen": 192819984, + "router_z_loss_mlp": 0.18359375, + "step": 2311, + "time_per_iteration": 2.761312246322632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081759, + "balance_loss_mlp": 1.06348383, + "epoch": 0.4447864563293574, + "flos": 629100089856.0, + "grad_norm": 0.06942740185124545, + "language_loss": 0.84283984, + "learning_rate": 0.0006127352214842795, + "loss": 0.85365742, + "num_input_tokens_seen": 192906080, + "router_z_loss_mlp": 0.1829834, + "step": 2312, + "time_per_iteration": 2.9890031814575195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083785, + "balance_loss_mlp": 1.06565332, + "epoch": 0.444978838014621, + "flos": 650838592512.0, + "grad_norm": 0.07063181629976649, + "language_loss": 0.85067087, + "learning_rate": 0.0006124316795219041, + "loss": 0.86150873, + "num_input_tokens_seen": 192972336, + "router_z_loss_mlp": 0.18139648, + "step": 2313, + "time_per_iteration": 2.7978243827819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_mlp": 1.06714296, + "epoch": 0.44517121969988455, + "flos": 612439501824.0, + "grad_norm": 0.08238507288636325, + "language_loss": 0.82411474, + "learning_rate": 0.0006121280939096794, + "loss": 0.83496892, + "num_input_tokens_seen": 193045744, + "router_z_loss_mlp": 0.1829834, + "step": 2314, + "time_per_iteration": 2.767470121383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087652, + "balance_loss_mlp": 1.06994963, + "epoch": 0.44536360138514813, + 
"flos": 488735368704.0, + "grad_norm": 0.09711161856626577, + "language_loss": 0.87964773, + "learning_rate": 0.000611824464765468, + "loss": 0.89052415, + "num_input_tokens_seen": 193115248, + "router_z_loss_mlp": 0.17712402, + "step": 2315, + "time_per_iteration": 2.58632493019104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019027, + "balance_loss_mlp": 1.00934732, + "epoch": 0.4455559830704117, + "flos": 1516148969472.0, + "grad_norm": 0.012462298147770837, + "language_loss": 0.78594941, + "learning_rate": 0.0006115207922071492, + "loss": 0.79613966, + "num_input_tokens_seen": 193330816, + "router_z_loss_mlp": 0.09667969, + "step": 2316, + "time_per_iteration": 4.68027400970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097979, + "balance_loss_mlp": 1.08057404, + "epoch": 0.44574836475567525, + "flos": 615614432256.0, + "grad_norm": 0.09030294554601531, + "language_loss": 0.85568595, + "learning_rate": 0.000611217076352619, + "loss": 0.86666572, + "num_input_tokens_seen": 193407616, + "router_z_loss_mlp": 0.17419434, + "step": 2317, + "time_per_iteration": 2.8745946884155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096258, + "balance_loss_mlp": 1.07860303, + "epoch": 0.44594074644093884, + "flos": 506342306304.0, + "grad_norm": 0.06320933370201777, + "language_loss": 0.83313119, + "learning_rate": 0.0006109133173197905, + "loss": 0.84409374, + "num_input_tokens_seen": 193482624, + "router_z_loss_mlp": 0.17675781, + "step": 2318, + "time_per_iteration": 2.719902515411377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104888, + "balance_loss_mlp": 1.08753085, + "epoch": 0.44613312812620237, + "flos": 726979318272.0, + "grad_norm": 0.07491768608262588, + "language_loss": 0.85073888, + "learning_rate": 0.0006106095152265935, + "loss": 0.86178774, + "num_input_tokens_seen": 193555952, + "router_z_loss_mlp": 0.17370605, + "step": 2319, + "time_per_iteration": 3.004857063293457 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111563, + "balance_loss_mlp": 1.0939796, + "epoch": 0.44632550981146596, + "flos": 635746558464.0, + "grad_norm": 0.08385510801007982, + "language_loss": 0.84405756, + "learning_rate": 0.0006103056701909739, + "loss": 0.85517317, + "num_input_tokens_seen": 193636672, + "router_z_loss_mlp": 0.17602539, + "step": 2320, + "time_per_iteration": 2.966923475265503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113274, + "balance_loss_mlp": 1.09577405, + "epoch": 0.4465178914967295, + "flos": 827074644480.0, + "grad_norm": 0.07685766834781843, + "language_loss": 0.8301264, + "learning_rate": 0.0006100017823308956, + "loss": 0.84125912, + "num_input_tokens_seen": 193721728, + "router_z_loss_mlp": 0.17504883, + "step": 2321, + "time_per_iteration": 3.204850196838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112805, + "balance_loss_mlp": 1.11025262, + "epoch": 0.4467102731819931, + "flos": 665831508480.0, + "grad_norm": 0.08670302679562208, + "language_loss": 0.79305983, + "learning_rate": 0.0006096978517643377, + "loss": 0.80434036, + "num_input_tokens_seen": 193795456, + "router_z_loss_mlp": 0.17797852, + "step": 2322, + "time_per_iteration": 2.860180139541626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112379, + "balance_loss_mlp": 1.10644507, + "epoch": 0.4469026548672566, + "flos": 512946929664.0, + "grad_norm": 0.12580563915967458, + "language_loss": 0.83188093, + "learning_rate": 0.0006093938786092968, + "loss": 0.84311885, + "num_input_tokens_seen": 193865520, + "router_z_loss_mlp": 0.17358398, + "step": 2323, + "time_per_iteration": 2.64030122756958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124017, + "balance_loss_mlp": 1.10691094, + "epoch": 0.4470950365525202, + "flos": 684076078080.0, + "grad_norm": 0.06761406024518349, + "language_loss": 0.89442849, + "learning_rate": 0.0006090898629837857, + "loss": 
0.90566862, + "num_input_tokens_seen": 193935040, + "router_z_loss_mlp": 0.17126465, + "step": 2324, + "time_per_iteration": 2.8378353118896484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137759, + "balance_loss_mlp": 1.1204021, + "epoch": 0.4472874182377838, + "flos": 627321189888.0, + "grad_norm": 0.1896235015526922, + "language_loss": 0.87233531, + "learning_rate": 0.0006087858050058337, + "loss": 0.88371289, + "num_input_tokens_seen": 194009120, + "router_z_loss_mlp": 0.17370605, + "step": 2325, + "time_per_iteration": 2.829404830932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131095, + "balance_loss_mlp": 1.1135118, + "epoch": 0.4474797999230473, + "flos": 547204916736.0, + "grad_norm": 0.07181125336629572, + "language_loss": 0.82417965, + "learning_rate": 0.0006084817047934866, + "loss": 0.83549058, + "num_input_tokens_seen": 194076672, + "router_z_loss_mlp": 0.17590332, + "step": 2326, + "time_per_iteration": 2.68251371383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134989, + "balance_loss_mlp": 1.11732209, + "epoch": 0.4476721816083109, + "flos": 455819083776.0, + "grad_norm": 0.08385131470703, + "language_loss": 0.89333081, + "learning_rate": 0.0006081775624648066, + "loss": 0.90468073, + "num_input_tokens_seen": 194142320, + "router_z_loss_mlp": 0.17675781, + "step": 2327, + "time_per_iteration": 2.533090591430664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138131, + "balance_loss_mlp": 1.12101269, + "epoch": 0.44786456329357444, + "flos": 481518079488.0, + "grad_norm": 0.10743629798598615, + "language_loss": 0.82534277, + "learning_rate": 0.0006078733781378721, + "loss": 0.83672416, + "num_input_tokens_seen": 194208560, + "router_z_loss_mlp": 0.17138672, + "step": 2328, + "time_per_iteration": 2.597377061843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111818, + "balance_loss_mlp": 1.10090625, + "epoch": 0.448056944978838, + "flos": 
552104418816.0, + "grad_norm": 0.07758231479291984, + "language_loss": 0.82049984, + "learning_rate": 0.0006075691519307781, + "loss": 0.83168161, + "num_input_tokens_seen": 194288080, + "router_z_loss_mlp": 0.17297363, + "step": 2329, + "time_per_iteration": 2.8866052627563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110227, + "balance_loss_mlp": 1.09251261, + "epoch": 0.44824932666410156, + "flos": 550839439872.0, + "grad_norm": 0.0702768888062288, + "language_loss": 0.81606984, + "learning_rate": 0.0006072648839616356, + "loss": 0.82717204, + "num_input_tokens_seen": 194358464, + "router_z_loss_mlp": 0.17724609, + "step": 2330, + "time_per_iteration": 2.7015554904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114358, + "balance_loss_mlp": 1.09686995, + "epoch": 0.44844170834936514, + "flos": 988582454784.0, + "grad_norm": 0.07321658937944422, + "language_loss": 0.82347071, + "learning_rate": 0.0006069605743485718, + "loss": 0.83461428, + "num_input_tokens_seen": 194456112, + "router_z_loss_mlp": 0.1751709, + "step": 2331, + "time_per_iteration": 3.3698229789733887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110303, + "balance_loss_mlp": 1.09319615, + "epoch": 0.44863409003462873, + "flos": 591321378816.0, + "grad_norm": 0.07314304322377065, + "language_loss": 0.83288682, + "learning_rate": 0.0006066562232097303, + "loss": 0.84398985, + "num_input_tokens_seen": 194526880, + "router_z_loss_mlp": 0.17126465, + "step": 2332, + "time_per_iteration": 2.7595224380493164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109542, + "balance_loss_mlp": 1.09135079, + "epoch": 0.44882647171989226, + "flos": 724646850048.0, + "grad_norm": 0.07260034454336384, + "language_loss": 0.86063141, + "learning_rate": 0.0006063518306632708, + "loss": 0.87172687, + "num_input_tokens_seen": 194606800, + "router_z_loss_mlp": 0.18200684, + "step": 2333, + "time_per_iteration": 2.973802089691162 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110869, + "balance_loss_mlp": 1.09335709, + "epoch": 0.44901885340515585, + "flos": 534927338496.0, + "grad_norm": 0.0724353146925312, + "language_loss": 0.82143402, + "learning_rate": 0.0006060473968273688, + "loss": 0.83254278, + "num_input_tokens_seen": 194679856, + "router_z_loss_mlp": 0.1751709, + "step": 2334, + "time_per_iteration": 2.716792583465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_mlp": 1.02476275, + "epoch": 0.4492112350904194, + "flos": 1555300942848.0, + "grad_norm": 0.01941960869972046, + "language_loss": 0.77879542, + "learning_rate": 0.000605742921820216, + "loss": 0.78913647, + "num_input_tokens_seen": 194906320, + "router_z_loss_mlp": 0.09326172, + "step": 2335, + "time_per_iteration": 4.891199827194214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025292, + "balance_loss_mlp": 1.01608956, + "epoch": 0.44940361677568297, + "flos": 1523358171648.0, + "grad_norm": 0.01646335982957884, + "language_loss": 0.81005216, + "learning_rate": 0.0006054384057600202, + "loss": 0.82030511, + "num_input_tokens_seen": 195129152, + "router_z_loss_mlp": 0.09179688, + "step": 2336, + "time_per_iteration": 4.873430013656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112894, + "balance_loss_mlp": 1.09513164, + "epoch": 0.4495959984609465, + "flos": 382495011840.0, + "grad_norm": 0.18670212144629325, + "language_loss": 0.88409269, + "learning_rate": 0.0006051338487650047, + "loss": 0.89522159, + "num_input_tokens_seen": 195189792, + "router_z_loss_mlp": 0.17785645, + "step": 2337, + "time_per_iteration": 2.4702365398406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106326, + "balance_loss_mlp": 1.08833754, + "epoch": 0.4497883801462101, + "flos": 497879861760.0, + "grad_norm": 0.08397051973497069, + "language_loss": 0.82701272, + "learning_rate": 0.0006048292509534095, + "loss": 
0.83807594, + "num_input_tokens_seen": 195258640, + "router_z_loss_mlp": 0.17993164, + "step": 2338, + "time_per_iteration": 2.619450569152832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110586, + "balance_loss_mlp": 1.08850312, + "epoch": 0.4499807618314736, + "flos": 614450769408.0, + "grad_norm": 0.20046859342765924, + "language_loss": 0.77607334, + "learning_rate": 0.0006045246124434895, + "loss": 0.78713191, + "num_input_tokens_seen": 195327984, + "router_z_loss_mlp": 0.17370605, + "step": 2339, + "time_per_iteration": 2.7321267127990723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105214, + "balance_loss_mlp": 1.08761835, + "epoch": 0.4501731435167372, + "flos": 1005510288384.0, + "grad_norm": 0.08075651314496221, + "language_loss": 0.865839, + "learning_rate": 0.0006042199333535162, + "loss": 0.8768912, + "num_input_tokens_seen": 195409504, + "router_z_loss_mlp": 0.17614746, + "step": 2340, + "time_per_iteration": 3.306898832321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100999, + "balance_loss_mlp": 1.08355892, + "epoch": 0.4503655252020008, + "flos": 820880428032.0, + "grad_norm": 0.06823291393488413, + "language_loss": 0.83802176, + "learning_rate": 0.0006039152138017763, + "loss": 0.84903181, + "num_input_tokens_seen": 195489424, + "router_z_loss_mlp": 0.17443848, + "step": 2341, + "time_per_iteration": 3.1458027362823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104806, + "balance_loss_mlp": 1.08727062, + "epoch": 0.4505579068872643, + "flos": 486373165056.0, + "grad_norm": 0.08305826290941032, + "language_loss": 0.83554494, + "learning_rate": 0.0006036104539065726, + "loss": 0.84659296, + "num_input_tokens_seen": 195562128, + "router_z_loss_mlp": 0.17541504, + "step": 2342, + "time_per_iteration": 2.6648519039154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102434, + "balance_loss_mlp": 1.08492208, + "epoch": 0.4507502885725279, + 
"flos": 884803046400.0, + "grad_norm": 0.06158872344302024, + "language_loss": 0.84248793, + "learning_rate": 0.000603305653786223, + "loss": 0.85351223, + "num_input_tokens_seen": 195646800, + "router_z_loss_mlp": 0.17529297, + "step": 2343, + "time_per_iteration": 3.176680326461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113287, + "balance_loss_mlp": 1.09581113, + "epoch": 0.45094267025779144, + "flos": 578339730432.0, + "grad_norm": 0.0747059506481359, + "language_loss": 0.84228522, + "learning_rate": 0.0006030008135590622, + "loss": 0.85341805, + "num_input_tokens_seen": 195719648, + "router_z_loss_mlp": 0.17480469, + "step": 2344, + "time_per_iteration": 2.742253065109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124309, + "balance_loss_mlp": 1.10722649, + "epoch": 0.45113505194305503, + "flos": 525387492864.0, + "grad_norm": 0.058134829204836994, + "language_loss": 0.799905, + "learning_rate": 0.0006026959333434387, + "loss": 0.81114811, + "num_input_tokens_seen": 195794800, + "router_z_loss_mlp": 0.17102051, + "step": 2345, + "time_per_iteration": 2.779311180114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132846, + "balance_loss_mlp": 1.11552477, + "epoch": 0.45132743362831856, + "flos": 502055470080.0, + "grad_norm": 0.07509063772314063, + "language_loss": 0.77367598, + "learning_rate": 0.0006023910132577181, + "loss": 0.78500438, + "num_input_tokens_seen": 195866848, + "router_z_loss_mlp": 0.17346191, + "step": 2346, + "time_per_iteration": 2.6779799461364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113297, + "balance_loss_mlp": 1.11554205, + "epoch": 0.45151981531358215, + "flos": 431918811648.0, + "grad_norm": 0.10491289793116987, + "language_loss": 0.84559381, + "learning_rate": 0.0006020860534202806, + "loss": 0.85692352, + "num_input_tokens_seen": 195930640, + "router_z_loss_mlp": 0.17443848, + "step": 2347, + "time_per_iteration": 2.528663158416748 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135795, + "balance_loss_mlp": 1.1183548, + "epoch": 0.4517121969988457, + "flos": 712159299072.0, + "grad_norm": 0.07098609761882418, + "language_loss": 0.80898821, + "learning_rate": 0.0006017810539495224, + "loss": 0.82034618, + "num_input_tokens_seen": 196014240, + "router_z_loss_mlp": 0.17468262, + "step": 2348, + "time_per_iteration": 2.9910202026367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111241, + "balance_loss_mlp": 1.09382474, + "epoch": 0.45190457868410927, + "flos": 579468888576.0, + "grad_norm": 0.07527105168067424, + "language_loss": 0.82186049, + "learning_rate": 0.0006014760149638547, + "loss": 0.83297288, + "num_input_tokens_seen": 196083296, + "router_z_loss_mlp": 0.17431641, + "step": 2349, + "time_per_iteration": 2.667600631713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124509, + "balance_loss_mlp": 1.10764134, + "epoch": 0.45209696036937286, + "flos": 482657149440.0, + "grad_norm": 0.07463444501983019, + "language_loss": 0.88244182, + "learning_rate": 0.000601170936581704, + "loss": 0.89368689, + "num_input_tokens_seen": 196147840, + "router_z_loss_mlp": 0.16870117, + "step": 2350, + "time_per_iteration": 2.5531952381134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124884, + "balance_loss_mlp": 1.10763478, + "epoch": 0.4522893420546364, + "flos": 540207512064.0, + "grad_norm": 0.07303827993658786, + "language_loss": 0.84088361, + "learning_rate": 0.0006008658189215121, + "loss": 0.85213244, + "num_input_tokens_seen": 196219008, + "router_z_loss_mlp": 0.17260742, + "step": 2351, + "time_per_iteration": 2.6667087078094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122786, + "balance_loss_mlp": 1.10538173, + "epoch": 0.4524817237399, + "flos": 496676551680.0, + "grad_norm": 0.08019313993326724, + "language_loss": 0.80211049, + "learning_rate": 0.0006005606621017366, + "loss": 
0.81333834, + "num_input_tokens_seen": 196287792, + "router_z_loss_mlp": 0.17419434, + "step": 2352, + "time_per_iteration": 2.5864298343658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110112, + "balance_loss_mlp": 1.09249294, + "epoch": 0.4526741054251635, + "flos": 652550681088.0, + "grad_norm": 0.08588176709504687, + "language_loss": 0.80108917, + "learning_rate": 0.0006002554662408496, + "loss": 0.81219029, + "num_input_tokens_seen": 196371776, + "router_z_loss_mlp": 0.1763916, + "step": 2353, + "time_per_iteration": 2.921902656555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106203, + "balance_loss_mlp": 1.08839345, + "epoch": 0.4528664871104271, + "flos": 570939632640.0, + "grad_norm": 0.08839686088246723, + "language_loss": 0.91245115, + "learning_rate": 0.0005999502314573388, + "loss": 0.92351323, + "num_input_tokens_seen": 196441840, + "router_z_loss_mlp": 0.17822266, + "step": 2354, + "time_per_iteration": 2.6538503170013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098374, + "balance_loss_mlp": 1.08077872, + "epoch": 0.45305886879569063, + "flos": 458719801344.0, + "grad_norm": 0.07972814176434397, + "language_loss": 0.85777891, + "learning_rate": 0.0005996449578697066, + "loss": 0.86876267, + "num_input_tokens_seen": 196510464, + "router_z_loss_mlp": 0.17590332, + "step": 2355, + "time_per_iteration": 2.6249992847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112931, + "balance_loss_mlp": 1.09541893, + "epoch": 0.4532512504809542, + "flos": 505178643456.0, + "grad_norm": 0.0715197090101731, + "language_loss": 0.81223947, + "learning_rate": 0.0005993396455964709, + "loss": 0.82336879, + "num_input_tokens_seen": 196583888, + "router_z_loss_mlp": 0.17541504, + "step": 2356, + "time_per_iteration": 2.69350266456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111792, + "balance_loss_mlp": 1.0944469, + "epoch": 0.4534436321662178, + 
"flos": 582213961728.0, + "grad_norm": 0.07234166204840274, + "language_loss": 0.81097758, + "learning_rate": 0.0005990342947561647, + "loss": 0.82209545, + "num_input_tokens_seen": 196652816, + "router_z_loss_mlp": 0.17358398, + "step": 2357, + "time_per_iteration": 2.7173328399658203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123689, + "balance_loss_mlp": 1.10639215, + "epoch": 0.45363601385148133, + "flos": 549720193536.0, + "grad_norm": 0.09230022277941517, + "language_loss": 0.78124547, + "learning_rate": 0.0005987289054673351, + "loss": 0.79248238, + "num_input_tokens_seen": 196720208, + "router_z_loss_mlp": 0.1730957, + "step": 2358, + "time_per_iteration": 2.633007526397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108071, + "balance_loss_mlp": 1.09800935, + "epoch": 0.4538283955367449, + "flos": 1474559520768.0, + "grad_norm": 0.0537090739321762, + "language_loss": 0.76575738, + "learning_rate": 0.0005984234778485451, + "loss": 0.77683806, + "num_input_tokens_seen": 196947696, + "router_z_loss_mlp": 0.10058594, + "step": 2359, + "time_per_iteration": 4.852884769439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011235, + "balance_loss_mlp": 1.10622633, + "epoch": 0.45402077722200845, + "flos": 584711986176.0, + "grad_norm": 0.07905851512069884, + "language_loss": 0.91134411, + "learning_rate": 0.0005981180120183722, + "loss": 0.92257917, + "num_input_tokens_seen": 197015712, + "router_z_loss_mlp": 0.17285156, + "step": 2360, + "time_per_iteration": 2.7044413089752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119068, + "balance_loss_mlp": 1.10053074, + "epoch": 0.45421315890727204, + "flos": 531747265536.0, + "grad_norm": 0.05732939327341075, + "language_loss": 0.85087699, + "learning_rate": 0.0005978125080954089, + "loss": 0.8620677, + "num_input_tokens_seen": 197094880, + "router_z_loss_mlp": 0.18530273, + "step": 2361, + "time_per_iteration": 2.775712251663208 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105652, + "balance_loss_mlp": 1.08805668, + "epoch": 0.4544055405925356, + "flos": 785221641216.0, + "grad_norm": 0.0789619101325961, + "language_loss": 0.7727446, + "learning_rate": 0.000597506966198262, + "loss": 0.78380114, + "num_input_tokens_seen": 197176448, + "router_z_loss_mlp": 0.17614746, + "step": 2362, + "time_per_iteration": 2.974111557006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110357, + "balance_loss_mlp": 1.08590329, + "epoch": 0.45459792227779916, + "flos": 518199939072.0, + "grad_norm": 0.0858902108709268, + "language_loss": 0.83994937, + "learning_rate": 0.0005972013864455536, + "loss": 0.85098517, + "num_input_tokens_seen": 197243520, + "router_z_loss_mlp": 0.17675781, + "step": 2363, + "time_per_iteration": 2.6244583129882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101233, + "balance_loss_mlp": 1.08366108, + "epoch": 0.4547903039630627, + "flos": 537563755008.0, + "grad_norm": 0.08015454662625561, + "language_loss": 0.851372, + "learning_rate": 0.0005968957689559203, + "loss": 0.86238432, + "num_input_tokens_seen": 197311536, + "router_z_loss_mlp": 0.17602539, + "step": 2364, + "time_per_iteration": 2.6717097759246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098129, + "balance_loss_mlp": 1.08035493, + "epoch": 0.4549826856483263, + "flos": 528676222464.0, + "grad_norm": 0.07229553193462525, + "language_loss": 0.88592815, + "learning_rate": 0.0005965901138480131, + "loss": 0.89690942, + "num_input_tokens_seen": 197382752, + "router_z_loss_mlp": 0.17785645, + "step": 2365, + "time_per_iteration": 2.653158664703369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098766, + "balance_loss_mlp": 1.08063412, + "epoch": 0.45517506733358987, + "flos": 520915276800.0, + "grad_norm": 0.07319480450828385, + "language_loss": 0.87207007, + "learning_rate": 0.0005962844212404982, + "loss": 
0.88305777, + "num_input_tokens_seen": 197456592, + "router_z_loss_mlp": 0.18151855, + "step": 2366, + "time_per_iteration": 2.727456569671631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110146, + "balance_loss_mlp": 1.0928843, + "epoch": 0.4553674490188534, + "flos": 451052831232.0, + "grad_norm": 0.06525288256406295, + "language_loss": 0.87264466, + "learning_rate": 0.0005959786912520558, + "loss": 0.88374615, + "num_input_tokens_seen": 197525408, + "router_z_loss_mlp": 0.17285156, + "step": 2367, + "time_per_iteration": 2.6637766361236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107553, + "balance_loss_mlp": 1.08999324, + "epoch": 0.455559830704117, + "flos": 546594448896.0, + "grad_norm": 0.061777726879510934, + "language_loss": 0.8370434, + "learning_rate": 0.0005956729240013806, + "loss": 0.84811896, + "num_input_tokens_seen": 197608480, + "router_z_loss_mlp": 0.17565918, + "step": 2368, + "time_per_iteration": 2.815329074859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107603, + "balance_loss_mlp": 1.08967423, + "epoch": 0.4557522123893805, + "flos": 583765636608.0, + "grad_norm": 0.07604266440979088, + "language_loss": 0.91824389, + "learning_rate": 0.0005953671196071824, + "loss": 0.92931986, + "num_input_tokens_seen": 197678416, + "router_z_loss_mlp": 0.17944336, + "step": 2369, + "time_per_iteration": 2.711060047149658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111288, + "balance_loss_mlp": 1.09501028, + "epoch": 0.4559445940746441, + "flos": 526415334912.0, + "grad_norm": 0.06552470471898014, + "language_loss": 0.80047917, + "learning_rate": 0.0005950612781881846, + "loss": 0.81160796, + "num_input_tokens_seen": 197753424, + "router_z_loss_mlp": 0.17871094, + "step": 2370, + "time_per_iteration": 2.710073709487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108328, + "balance_loss_mlp": 1.09072089, + "epoch": 0.45613697575990764, + 
"flos": 652120823808.0, + "grad_norm": 0.1576706166146413, + "language_loss": 0.75711769, + "learning_rate": 0.0005947553998631259, + "loss": 0.76820099, + "num_input_tokens_seen": 197832080, + "router_z_loss_mlp": 0.1763916, + "step": 2371, + "time_per_iteration": 2.855384588241577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098527, + "balance_loss_mlp": 1.08041906, + "epoch": 0.4563293574451712, + "flos": 867119385600.0, + "grad_norm": 0.056716395855559716, + "language_loss": 0.78911364, + "learning_rate": 0.000594449484750758, + "loss": 0.8000989, + "num_input_tokens_seen": 197919536, + "router_z_loss_mlp": 0.18127441, + "step": 2372, + "time_per_iteration": 4.694324493408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095337, + "balance_loss_mlp": 1.07693148, + "epoch": 0.45652173913043476, + "flos": 498079922688.0, + "grad_norm": 0.07402703052898342, + "language_loss": 0.82845718, + "learning_rate": 0.0005941435329698484, + "loss": 0.83941054, + "num_input_tokens_seen": 197991872, + "router_z_loss_mlp": 0.18395996, + "step": 2373, + "time_per_iteration": 2.677161693572998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094792, + "balance_loss_mlp": 1.07592094, + "epoch": 0.45671412081569834, + "flos": 560856130560.0, + "grad_norm": 0.07242003224557565, + "language_loss": 0.82777703, + "learning_rate": 0.0005938375446391778, + "loss": 0.83872497, + "num_input_tokens_seen": 198063392, + "router_z_loss_mlp": 0.18847656, + "step": 2374, + "time_per_iteration": 2.6986706256866455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094981, + "balance_loss_mlp": 1.07626557, + "epoch": 0.45690650250096193, + "flos": 503122959360.0, + "grad_norm": 0.09602017850343586, + "language_loss": 0.88724887, + "learning_rate": 0.0005935315198775415, + "loss": 0.89819872, + "num_input_tokens_seen": 198131232, + "router_z_loss_mlp": 0.18713379, + "step": 2375, + "time_per_iteration": 2.6160995960235596 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097379, + "balance_loss_mlp": 1.07811522, + "epoch": 0.45709888418622546, + "flos": 430698249216.0, + "grad_norm": 0.07644315743317759, + "language_loss": 0.86640108, + "learning_rate": 0.0005932254588037486, + "loss": 0.87737489, + "num_input_tokens_seen": 198194944, + "router_z_loss_mlp": 0.19262695, + "step": 2376, + "time_per_iteration": 2.5169382095336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097089, + "balance_loss_mlp": 1.07751513, + "epoch": 0.45729126587148905, + "flos": 525654365184.0, + "grad_norm": 0.07850584285058836, + "language_loss": 0.86183727, + "learning_rate": 0.000592919361536623, + "loss": 0.87280822, + "num_input_tokens_seen": 198265728, + "router_z_loss_mlp": 0.19580078, + "step": 2377, + "time_per_iteration": 2.668555498123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099166, + "balance_loss_mlp": 1.07996106, + "epoch": 0.4574836475567526, + "flos": 638002676736.0, + "grad_norm": 0.07491389260925961, + "language_loss": 0.89019889, + "learning_rate": 0.0005926132281950017, + "loss": 0.90119052, + "num_input_tokens_seen": 198336640, + "router_z_loss_mlp": 0.19213867, + "step": 2378, + "time_per_iteration": 2.7553632259368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098644, + "balance_loss_mlp": 1.07934439, + "epoch": 0.45767602924201617, + "flos": 649588294656.0, + "grad_norm": 0.07088499852096378, + "language_loss": 0.84996307, + "learning_rate": 0.0005923070588977367, + "loss": 0.86094952, + "num_input_tokens_seen": 198413552, + "router_z_loss_mlp": 0.19287109, + "step": 2379, + "time_per_iteration": 2.8268253803253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105163, + "balance_loss_mlp": 1.08666205, + "epoch": 0.4578684109272797, + "flos": 746676817920.0, + "grad_norm": 0.08663232567685626, + "language_loss": 0.85752875, + "learning_rate": 0.0005920008537636931, + 
"loss": 0.86858034, + "num_input_tokens_seen": 198490864, + "router_z_loss_mlp": 0.18493652, + "step": 2380, + "time_per_iteration": 2.9154610633850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111966, + "balance_loss_mlp": 1.09322584, + "epoch": 0.4580607926125433, + "flos": 641469072384.0, + "grad_norm": 0.06304298978525442, + "language_loss": 0.86810696, + "learning_rate": 0.0005916946129117504, + "loss": 0.87922657, + "num_input_tokens_seen": 198571200, + "router_z_loss_mlp": 0.18725586, + "step": 2381, + "time_per_iteration": 2.9332261085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116157, + "balance_loss_mlp": 1.09857368, + "epoch": 0.4582531742978069, + "flos": 801857636352.0, + "grad_norm": 0.07662767679861947, + "language_loss": 0.81012738, + "learning_rate": 0.0005913883364608017, + "loss": 0.821289, + "num_input_tokens_seen": 198658624, + "router_z_loss_mlp": 0.17602539, + "step": 2382, + "time_per_iteration": 3.0999624729156494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122668, + "balance_loss_mlp": 1.104954, + "epoch": 0.4584455559830704, + "flos": 684295962624.0, + "grad_norm": 0.07647659587343762, + "language_loss": 0.88500929, + "learning_rate": 0.0005910820245297542, + "loss": 0.89623594, + "num_input_tokens_seen": 198731312, + "router_z_loss_mlp": 0.17724609, + "step": 2383, + "time_per_iteration": 2.8880879878997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124722, + "balance_loss_mlp": 1.10707903, + "epoch": 0.458637937668334, + "flos": 518177544192.0, + "grad_norm": 0.0900951330432027, + "language_loss": 0.80609989, + "learning_rate": 0.000590775677237529, + "loss": 0.81734717, + "num_input_tokens_seen": 198805296, + "router_z_loss_mlp": 0.17651367, + "step": 2384, + "time_per_iteration": 2.758249044418335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133426, + "balance_loss_mlp": 1.11639071, + "epoch": 0.4588303193535975, 
+ "flos": 505499844096.0, + "grad_norm": 0.08076424564900554, + "language_loss": 0.79984713, + "learning_rate": 0.0005904692947030601, + "loss": 0.81118137, + "num_input_tokens_seen": 198872112, + "router_z_loss_mlp": 0.17053223, + "step": 2385, + "time_per_iteration": 2.667224168777466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129309, + "balance_loss_mlp": 1.11242914, + "epoch": 0.4590227010388611, + "flos": 495905670144.0, + "grad_norm": 0.10079326608985974, + "language_loss": 0.89998889, + "learning_rate": 0.0005901628770452963, + "loss": 0.91128194, + "num_input_tokens_seen": 198938480, + "router_z_loss_mlp": 0.16894531, + "step": 2386, + "time_per_iteration": 2.5951790809631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129955, + "balance_loss_mlp": 1.1131345, + "epoch": 0.45921508272412465, + "flos": 493620189696.0, + "grad_norm": 0.06835358350560915, + "language_loss": 0.87016714, + "learning_rate": 0.000589856424383199, + "loss": 0.88146669, + "num_input_tokens_seen": 199008608, + "router_z_loss_mlp": 0.16833496, + "step": 2387, + "time_per_iteration": 2.6031622886657715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112576, + "balance_loss_mlp": 1.1086055, + "epoch": 0.45940746440938823, + "flos": 691394683392.0, + "grad_norm": 0.07768127603303249, + "language_loss": 0.82945853, + "learning_rate": 0.000589549936835744, + "loss": 0.84071612, + "num_input_tokens_seen": 199084592, + "router_z_loss_mlp": 0.17175293, + "step": 2388, + "time_per_iteration": 2.903437376022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112224, + "balance_loss_mlp": 1.10476351, + "epoch": 0.45959984609465176, + "flos": 503738196480.0, + "grad_norm": 0.06100287690428954, + "language_loss": 0.78894806, + "learning_rate": 0.0005892434145219202, + "loss": 0.80017042, + "num_input_tokens_seen": 199151504, + "router_z_loss_mlp": 0.17504883, + "step": 2389, + "time_per_iteration": 2.61372709274292 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104771, + "balance_loss_mlp": 1.08758104, + "epoch": 0.45979222777991535, + "flos": 676638904320.0, + "grad_norm": 0.07434011004541237, + "language_loss": 0.8214674, + "learning_rate": 0.0005889368575607303, + "loss": 0.83251518, + "num_input_tokens_seen": 199224528, + "router_z_loss_mlp": 0.17211914, + "step": 2390, + "time_per_iteration": 2.894376039505005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113067, + "balance_loss_mlp": 1.09576964, + "epoch": 0.45998460946517894, + "flos": 777653415936.0, + "grad_norm": 0.08125857985315703, + "language_loss": 0.78747576, + "learning_rate": 0.00058863026607119, + "loss": 0.79860646, + "num_input_tokens_seen": 199312512, + "router_z_loss_mlp": 0.1730957, + "step": 2391, + "time_per_iteration": 3.112093210220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118062, + "balance_loss_mlp": 1.10093117, + "epoch": 0.46017699115044247, + "flos": 851461673472.0, + "grad_norm": 0.08788037013511367, + "language_loss": 0.7955699, + "learning_rate": 0.0005883236401723287, + "loss": 0.80675054, + "num_input_tokens_seen": 199397216, + "router_z_loss_mlp": 0.17150879, + "step": 2392, + "time_per_iteration": 3.242553472518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110798, + "balance_loss_mlp": 1.09295249, + "epoch": 0.46036937283570606, + "flos": 575878781952.0, + "grad_norm": 0.08816777762822899, + "language_loss": 0.84516722, + "learning_rate": 0.0005880169799831893, + "loss": 0.8562752, + "num_input_tokens_seen": 199464288, + "router_z_loss_mlp": 0.17858887, + "step": 2393, + "time_per_iteration": 2.6654422283172607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098957, + "balance_loss_mlp": 1.08111119, + "epoch": 0.4605617545209696, + "flos": 611866109952.0, + "grad_norm": 0.10997873970116459, + "language_loss": 0.81234348, + "learning_rate": 0.0005877102856228278, + "loss": 
0.82333302, + "num_input_tokens_seen": 199538096, + "router_z_loss_mlp": 0.17858887, + "step": 2394, + "time_per_iteration": 2.873918294906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103019, + "balance_loss_mlp": 1.08542323, + "epoch": 0.4607541362062332, + "flos": 533138526720.0, + "grad_norm": 0.07484934817589016, + "language_loss": 0.84600067, + "learning_rate": 0.0005874035572103133, + "loss": 0.85703087, + "num_input_tokens_seen": 199609504, + "router_z_loss_mlp": 0.17602539, + "step": 2395, + "time_per_iteration": 2.6604511737823486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106186, + "balance_loss_mlp": 1.08816206, + "epoch": 0.4609465178914967, + "flos": 647312726016.0, + "grad_norm": 0.09236346174205023, + "language_loss": 0.82285452, + "learning_rate": 0.0005870967948647288, + "loss": 0.83391643, + "num_input_tokens_seen": 199678960, + "router_z_loss_mlp": 0.18041992, + "step": 2396, + "time_per_iteration": 2.805236339569092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088357, + "balance_loss_mlp": 1.0784868, + "epoch": 0.4611388995767603, + "flos": 1466287225344.0, + "grad_norm": 0.0372592343397745, + "language_loss": 0.743083, + "learning_rate": 0.0005867899987051693, + "loss": 0.75396657, + "num_input_tokens_seen": 199903568, + "router_z_loss_mlp": 0.09863281, + "step": 2397, + "time_per_iteration": 5.380864143371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114264, + "balance_loss_mlp": 1.09671664, + "epoch": 0.46133128126202383, + "flos": 723112427520.0, + "grad_norm": 0.08046670019017348, + "language_loss": 0.85787129, + "learning_rate": 0.0005864831688507443, + "loss": 0.86901391, + "num_input_tokens_seen": 199988672, + "router_z_loss_mlp": 0.17553711, + "step": 2398, + "time_per_iteration": 3.1147820949554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119108, + "balance_loss_mlp": 1.10053492, + "epoch": 0.4615236629472874, + 
"flos": 548010302976.0, + "grad_norm": 0.08636966322347801, + "language_loss": 0.75248241, + "learning_rate": 0.0005861763054205754, + "loss": 0.76367348, + "num_input_tokens_seen": 200062304, + "router_z_loss_mlp": 0.18566895, + "step": 2399, + "time_per_iteration": 2.787648916244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126018, + "balance_loss_mlp": 1.10773087, + "epoch": 0.461716044632551, + "flos": 602244771840.0, + "grad_norm": 0.07252969708721291, + "language_loss": 0.80419457, + "learning_rate": 0.0005858694085337976, + "loss": 0.81545472, + "num_input_tokens_seen": 200138464, + "router_z_loss_mlp": 0.18273926, + "step": 2400, + "time_per_iteration": 2.859846591949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113297, + "balance_loss_mlp": 1.09409237, + "epoch": 0.46190842631781454, + "flos": 474476258304.0, + "grad_norm": 0.08888433146403377, + "language_loss": 0.83730817, + "learning_rate": 0.0005855624783095589, + "loss": 0.84844118, + "num_input_tokens_seen": 200205728, + "router_z_loss_mlp": 0.19189453, + "step": 2401, + "time_per_iteration": 2.5447638034820557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107409, + "balance_loss_mlp": 1.08806109, + "epoch": 0.4621008080030781, + "flos": 437483109888.0, + "grad_norm": 0.06969383703523749, + "language_loss": 0.85055763, + "learning_rate": 0.00058525551486702, + "loss": 0.86163163, + "num_input_tokens_seen": 200269824, + "router_z_loss_mlp": 0.19335938, + "step": 2402, + "time_per_iteration": 2.5561320781707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099434, + "balance_loss_mlp": 1.08090901, + "epoch": 0.46229318968834165, + "flos": 525461644800.0, + "grad_norm": 0.0974904106662223, + "language_loss": 0.80911911, + "learning_rate": 0.0005849485183253548, + "loss": 0.82011348, + "num_input_tokens_seen": 200341264, + "router_z_loss_mlp": 0.18530273, + "step": 2403, + "time_per_iteration": 2.6459126472473145 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099055, + "balance_loss_mlp": 1.08017266, + "epoch": 0.46248557137360524, + "flos": 439622857728.0, + "grad_norm": 0.06563821415676413, + "language_loss": 0.87331611, + "learning_rate": 0.0005846414888037501, + "loss": 0.88430667, + "num_input_tokens_seen": 200405632, + "router_z_loss_mlp": 0.18896484, + "step": 2404, + "time_per_iteration": 2.5333003997802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091218, + "balance_loss_mlp": 1.07249045, + "epoch": 0.4626779530588688, + "flos": 617608447488.0, + "grad_norm": 0.06903002712252786, + "language_loss": 0.82273191, + "learning_rate": 0.0005843344264214049, + "loss": 0.83364403, + "num_input_tokens_seen": 200479312, + "router_z_loss_mlp": 0.18701172, + "step": 2405, + "time_per_iteration": 2.806748628616333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103621, + "balance_loss_mlp": 1.08491707, + "epoch": 0.46287033474413236, + "flos": 670108432896.0, + "grad_norm": 0.07210099338506677, + "language_loss": 0.84715909, + "learning_rate": 0.0005840273312975317, + "loss": 0.8581953, + "num_input_tokens_seen": 200552976, + "router_z_loss_mlp": 0.18701172, + "step": 2406, + "time_per_iteration": 2.884800910949707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113888, + "balance_loss_mlp": 1.09550619, + "epoch": 0.46306271642939595, + "flos": 480233276928.0, + "grad_norm": 0.08103405236073111, + "language_loss": 0.90235025, + "learning_rate": 0.0005837202035513555, + "loss": 0.9134891, + "num_input_tokens_seen": 200621088, + "router_z_loss_mlp": 0.18383789, + "step": 2407, + "time_per_iteration": 2.609774351119995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114389, + "balance_loss_mlp": 1.09645963, + "epoch": 0.4632550981146595, + "flos": 580686879744.0, + "grad_norm": 0.08825825577707168, + "language_loss": 0.81317043, + "learning_rate": 0.0005834130433021136, + "loss": 
0.8243143, + "num_input_tokens_seen": 200698400, + "router_z_loss_mlp": 0.17932129, + "step": 2408, + "time_per_iteration": 2.775449514389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109877, + "balance_loss_mlp": 1.09179258, + "epoch": 0.46344747979992307, + "flos": 523964298240.0, + "grad_norm": 0.07528135433799624, + "language_loss": 0.73480821, + "learning_rate": 0.0005831058506690563, + "loss": 0.74590695, + "num_input_tokens_seen": 200767264, + "router_z_loss_mlp": 0.18078613, + "step": 2409, + "time_per_iteration": 2.675328254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104926, + "balance_loss_mlp": 1.08739018, + "epoch": 0.4636398614851866, + "flos": 746501349888.0, + "grad_norm": 0.06500990989470928, + "language_loss": 0.85772568, + "learning_rate": 0.0005827986257714464, + "loss": 0.86877489, + "num_input_tokens_seen": 200841440, + "router_z_loss_mlp": 0.17541504, + "step": 2410, + "time_per_iteration": 2.934680461883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106367, + "balance_loss_mlp": 1.0885216, + "epoch": 0.4638322431704502, + "flos": 596547224064.0, + "grad_norm": 0.1078033090301908, + "language_loss": 0.88550043, + "learning_rate": 0.0005824913687285591, + "loss": 0.89656413, + "num_input_tokens_seen": 200911296, + "router_z_loss_mlp": 0.17858887, + "step": 2411, + "time_per_iteration": 2.74306058883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101516, + "balance_loss_mlp": 1.08387256, + "epoch": 0.4640246248557137, + "flos": 539443971072.0, + "grad_norm": 0.08594294380237487, + "language_loss": 0.81337988, + "learning_rate": 0.0005821840796596821, + "loss": 0.82439506, + "num_input_tokens_seen": 200981920, + "router_z_loss_mlp": 0.17663574, + "step": 2412, + "time_per_iteration": 2.7274651527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105841, + "balance_loss_mlp": 1.08832955, + "epoch": 0.4642170065409773, + 
"flos": 562625118720.0, + "grad_norm": 0.05827694326073197, + "language_loss": 0.80418169, + "learning_rate": 0.0005818767586841158, + "loss": 0.81524014, + "num_input_tokens_seen": 201059392, + "router_z_loss_mlp": 0.1751709, + "step": 2413, + "time_per_iteration": 2.779078722000122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109963, + "balance_loss_mlp": 1.09252286, + "epoch": 0.46440938822624084, + "flos": 530959131648.0, + "grad_norm": 0.06834094492641501, + "language_loss": 0.86072665, + "learning_rate": 0.0005815694059211726, + "loss": 0.87182629, + "num_input_tokens_seen": 201130192, + "router_z_loss_mlp": 0.17456055, + "step": 2414, + "time_per_iteration": 2.7060773372650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022324, + "balance_loss_mlp": 1.01297832, + "epoch": 0.4646017699115044, + "flos": 1526325700608.0, + "grad_norm": 0.02599871836797638, + "language_loss": 0.80873632, + "learning_rate": 0.0005812620214901778, + "loss": 0.81895959, + "num_input_tokens_seen": 201354720, + "router_z_loss_mlp": 0.09326172, + "step": 2415, + "time_per_iteration": 4.83809757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018248, + "balance_loss_mlp": 1.00894976, + "epoch": 0.464794151596768, + "flos": 1540831859712.0, + "grad_norm": 0.022144294594628845, + "language_loss": 0.7694506, + "learning_rate": 0.000580954605510468, + "loss": 0.7796331, + "num_input_tokens_seen": 201592096, + "router_z_loss_mlp": 0.09277344, + "step": 2416, + "time_per_iteration": 4.993790626525879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135903, + "balance_loss_mlp": 1.11812854, + "epoch": 0.46498653328203154, + "flos": 501467397120.0, + "grad_norm": 0.10260058932365836, + "language_loss": 0.862611, + "learning_rate": 0.0005806471581013931, + "loss": 0.87397003, + "num_input_tokens_seen": 201666160, + "router_z_loss_mlp": 0.17785645, + "step": 2417, + "time_per_iteration": 2.689473867416382 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142112, + "balance_loss_mlp": 1.12353921, + "epoch": 0.46517891496729513, + "flos": 676144806912.0, + "grad_norm": 0.08959237751331865, + "language_loss": 0.78271216, + "learning_rate": 0.0005803396793823146, + "loss": 0.79413325, + "num_input_tokens_seen": 201733552, + "router_z_loss_mlp": 0.18579102, + "step": 2418, + "time_per_iteration": 2.8183717727661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126782, + "balance_loss_mlp": 1.10836434, + "epoch": 0.46537129665255866, + "flos": 585351816192.0, + "grad_norm": 0.10270562971795844, + "language_loss": 0.85666251, + "learning_rate": 0.0005800321694726065, + "loss": 0.86793029, + "num_input_tokens_seen": 201806128, + "router_z_loss_mlp": 0.1842041, + "step": 2419, + "time_per_iteration": 2.797482490539551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116623, + "balance_loss_mlp": 1.09855139, + "epoch": 0.46556367833782225, + "flos": 587704108032.0, + "grad_norm": 0.0731094360896604, + "language_loss": 0.86679709, + "learning_rate": 0.0005797246284916545, + "loss": 0.8779633, + "num_input_tokens_seen": 201874224, + "router_z_loss_mlp": 0.1809082, + "step": 2420, + "time_per_iteration": 2.707942008972168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054766, + "balance_loss_mlp": 1.04570651, + "epoch": 0.4657560600230858, + "flos": 1485453551616.0, + "grad_norm": 0.038938158808133214, + "language_loss": 0.77505189, + "learning_rate": 0.0005794170565588569, + "loss": 0.78559953, + "num_input_tokens_seen": 202111648, + "router_z_loss_mlp": 0.09082031, + "step": 2421, + "time_per_iteration": 4.987195253372192 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094388, + "balance_loss_mlp": 1.07617295, + "epoch": 0.46594844170834937, + "flos": 580247110656.0, + "grad_norm": 0.09940681141683862, + "language_loss": 0.87739611, + "learning_rate": 0.0005791094537936233, + "loss": 
0.88833994, + "num_input_tokens_seen": 202183344, + "router_z_loss_mlp": 0.18237305, + "step": 2422, + "time_per_iteration": 2.7631046772003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091469, + "balance_loss_mlp": 1.07345629, + "epoch": 0.4661408233936129, + "flos": 512571400704.0, + "grad_norm": 0.06779180589479097, + "language_loss": 0.8166219, + "learning_rate": 0.0005788018203153762, + "loss": 0.82753664, + "num_input_tokens_seen": 202252512, + "router_z_loss_mlp": 0.18017578, + "step": 2423, + "time_per_iteration": 2.6615488529205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085208, + "balance_loss_mlp": 1.06742215, + "epoch": 0.4663332050788765, + "flos": 491077748736.0, + "grad_norm": 0.08426811135055082, + "language_loss": 0.85527384, + "learning_rate": 0.000578494156243549, + "loss": 0.86612594, + "num_input_tokens_seen": 202320096, + "router_z_loss_mlp": 0.17810059, + "step": 2424, + "time_per_iteration": 2.6183924674987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089098, + "balance_loss_mlp": 1.07045364, + "epoch": 0.4665255867641401, + "flos": 512623157760.0, + "grad_norm": 0.08457394710823794, + "language_loss": 0.89275956, + "learning_rate": 0.0005781864616975878, + "loss": 0.90365046, + "num_input_tokens_seen": 202391552, + "router_z_loss_mlp": 0.18640137, + "step": 2425, + "time_per_iteration": 2.6595993041992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096807, + "balance_loss_mlp": 1.07906842, + "epoch": 0.4667179684494036, + "flos": 424812750336.0, + "grad_norm": 0.0955155738973633, + "language_loss": 0.84080482, + "learning_rate": 0.0005778787367969502, + "loss": 0.8517729, + "num_input_tokens_seen": 202457328, + "router_z_loss_mlp": 0.17749023, + "step": 2426, + "time_per_iteration": 2.573312759399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010984, + "balance_loss_mlp": 1.08017302, + "epoch": 0.4669103501346672, + 
"flos": 707956526592.0, + "grad_norm": 0.07224995984565184, + "language_loss": 0.81008911, + "learning_rate": 0.0005775709816611053, + "loss": 0.82107311, + "num_input_tokens_seen": 202535888, + "router_z_loss_mlp": 0.18237305, + "step": 2427, + "time_per_iteration": 2.9737117290496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096193, + "balance_loss_mlp": 1.07804918, + "epoch": 0.4671027318199307, + "flos": 554832239616.0, + "grad_norm": 0.0630888064205099, + "language_loss": 0.83649611, + "learning_rate": 0.0005772631964095346, + "loss": 0.84745806, + "num_input_tokens_seen": 202608400, + "router_z_loss_mlp": 0.18151855, + "step": 2428, + "time_per_iteration": 2.7121798992156982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108204, + "balance_loss_mlp": 1.09003639, + "epoch": 0.4672951135051943, + "flos": 567109817856.0, + "grad_norm": 0.07098479359046088, + "language_loss": 0.85673976, + "learning_rate": 0.000576955381161731, + "loss": 0.86782181, + "num_input_tokens_seen": 202677712, + "router_z_loss_mlp": 0.18164062, + "step": 2429, + "time_per_iteration": 2.7059943675994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102277, + "balance_loss_mlp": 1.08414483, + "epoch": 0.46748749519045785, + "flos": 424518713856.0, + "grad_norm": 0.07900180679196234, + "language_loss": 0.86017609, + "learning_rate": 0.0005766475360371985, + "loss": 0.87119883, + "num_input_tokens_seen": 202743824, + "router_z_loss_mlp": 0.18115234, + "step": 2430, + "time_per_iteration": 2.5818653106689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106318, + "balance_loss_mlp": 1.08826935, + "epoch": 0.46767987687572143, + "flos": 538344548352.0, + "grad_norm": 0.07907770586360956, + "language_loss": 0.8455205, + "learning_rate": 0.0005763396611554536, + "loss": 0.85658371, + "num_input_tokens_seen": 202813072, + "router_z_loss_mlp": 0.18066406, + "step": 2431, + "time_per_iteration": 
2.6773664951324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109521, + "balance_loss_mlp": 1.09193754, + "epoch": 0.467872258560985, + "flos": 823702224384.0, + "grad_norm": 0.09111480047327246, + "language_loss": 0.79973984, + "learning_rate": 0.0005760317566360237, + "loss": 0.81083506, + "num_input_tokens_seen": 202886576, + "router_z_loss_mlp": 0.17602539, + "step": 2432, + "time_per_iteration": 3.014580726623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114014, + "balance_loss_mlp": 1.09622765, + "epoch": 0.46806464024624855, + "flos": 661663240704.0, + "grad_norm": 0.0789075933194326, + "language_loss": 0.85020924, + "learning_rate": 0.000575723822598448, + "loss": 0.86134946, + "num_input_tokens_seen": 202956736, + "router_z_loss_mlp": 0.17785645, + "step": 2433, + "time_per_iteration": 2.8005478382110596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111562, + "balance_loss_mlp": 1.09765542, + "epoch": 0.46825702193151214, + "flos": 755700171264.0, + "grad_norm": 0.07367233066443238, + "language_loss": 0.8147794, + "learning_rate": 0.0005754158591622773, + "loss": 0.82593554, + "num_input_tokens_seen": 203036432, + "router_z_loss_mlp": 0.1796875, + "step": 2434, + "time_per_iteration": 3.0118775367736816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011158, + "balance_loss_mlp": 1.09752536, + "epoch": 0.4684494036167757, + "flos": 439393061376.0, + "grad_norm": 0.07922373152064655, + "language_loss": 0.82327235, + "learning_rate": 0.0005751078664470732, + "loss": 0.83443034, + "num_input_tokens_seen": 203101904, + "router_z_loss_mlp": 0.18286133, + "step": 2435, + "time_per_iteration": 2.5390684604644775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116733, + "balance_loss_mlp": 1.09935236, + "epoch": 0.46864178530203926, + "flos": 532706098176.0, + "grad_norm": 0.07859313369065737, + "language_loss": 0.85868919, + "learning_rate": 
0.0005747998445724094, + "loss": 0.86985648, + "num_input_tokens_seen": 203170272, + "router_z_loss_mlp": 0.17382812, + "step": 2436, + "time_per_iteration": 2.6606297492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112946, + "balance_loss_mlp": 1.11235368, + "epoch": 0.4688341669873028, + "flos": 576627268608.0, + "grad_norm": 0.10622400322266522, + "language_loss": 0.8919673, + "learning_rate": 0.0005744917936578707, + "loss": 0.90326178, + "num_input_tokens_seen": 203243920, + "router_z_loss_mlp": 0.17126465, + "step": 2437, + "time_per_iteration": 2.8204565048217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121886, + "balance_loss_mlp": 1.10436273, + "epoch": 0.4690265486725664, + "flos": 539579791872.0, + "grad_norm": 0.06508472909978535, + "language_loss": 0.8377744, + "learning_rate": 0.0005741837138230526, + "loss": 0.8489933, + "num_input_tokens_seen": 203321760, + "router_z_loss_mlp": 0.17553711, + "step": 2438, + "time_per_iteration": 2.781350612640381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122059, + "balance_loss_mlp": 1.10464203, + "epoch": 0.4692189303578299, + "flos": 770510278656.0, + "grad_norm": 0.06834159619761165, + "language_loss": 0.86276829, + "learning_rate": 0.0005738756051875627, + "loss": 0.87398893, + "num_input_tokens_seen": 203409088, + "router_z_loss_mlp": 0.17431641, + "step": 2439, + "time_per_iteration": 3.121708631515503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131074, + "balance_loss_mlp": 1.11383653, + "epoch": 0.4694113120430935, + "flos": 571396654080.0, + "grad_norm": 0.07303953933220877, + "language_loss": 0.82923281, + "learning_rate": 0.0005735674678710192, + "loss": 0.84054363, + "num_input_tokens_seen": 203481680, + "router_z_loss_mlp": 0.17260742, + "step": 2440, + "time_per_iteration": 2.749302864074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122683, + "balance_loss_mlp": 1.1051836, + 
"epoch": 0.4696036937283571, + "flos": 748816565760.0, + "grad_norm": 0.1547549936477752, + "language_loss": 0.80928504, + "learning_rate": 0.0005732593019930517, + "loss": 0.82051194, + "num_input_tokens_seen": 203554848, + "router_z_loss_mlp": 0.17504883, + "step": 2441, + "time_per_iteration": 2.9091122150421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137766, + "balance_loss_mlp": 1.12098181, + "epoch": 0.4697960754136206, + "flos": 493454633472.0, + "grad_norm": 0.0743256165664551, + "language_loss": 0.87914228, + "learning_rate": 0.0005729511076733008, + "loss": 0.89051992, + "num_input_tokens_seen": 203624816, + "router_z_loss_mlp": 0.16796875, + "step": 2442, + "time_per_iteration": 2.728706121444702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140336, + "balance_loss_mlp": 1.12288404, + "epoch": 0.4699884570988842, + "flos": 725118925824.0, + "grad_norm": 0.07419109808583535, + "language_loss": 0.84796697, + "learning_rate": 0.000572642885031418, + "loss": 0.85937035, + "num_input_tokens_seen": 203698256, + "router_z_loss_mlp": 0.17456055, + "step": 2443, + "time_per_iteration": 2.8746440410614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134354, + "balance_loss_mlp": 1.11715245, + "epoch": 0.47018083878414774, + "flos": 555427653120.0, + "grad_norm": 0.10756822588652355, + "language_loss": 0.80578518, + "learning_rate": 0.0005723346341870662, + "loss": 0.81712866, + "num_input_tokens_seen": 203772672, + "router_z_loss_mlp": 0.17224121, + "step": 2444, + "time_per_iteration": 2.740504741668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114197, + "balance_loss_mlp": 1.12406492, + "epoch": 0.4703732204694113, + "flos": 424069032960.0, + "grad_norm": 0.12204296392179416, + "language_loss": 0.86163437, + "learning_rate": 0.0005720263552599188, + "loss": 0.87305409, + "num_input_tokens_seen": 203835904, + "router_z_loss_mlp": 0.17907715, + "step": 2445, + 
"time_per_iteration": 2.489807367324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112646, + "balance_loss_mlp": 1.10888886, + "epoch": 0.47056560215467486, + "flos": 703494222336.0, + "grad_norm": 0.08439630255123334, + "language_loss": 0.79720879, + "learning_rate": 0.0005717180483696604, + "loss": 0.80847341, + "num_input_tokens_seen": 203914704, + "router_z_loss_mlp": 0.17590332, + "step": 2446, + "time_per_iteration": 2.9626049995422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113574, + "balance_loss_mlp": 1.09573984, + "epoch": 0.47075798383993844, + "flos": 554963291136.0, + "grad_norm": 0.0764291785045912, + "language_loss": 0.83012414, + "learning_rate": 0.0005714097136359862, + "loss": 0.84125984, + "num_input_tokens_seen": 203985072, + "router_z_loss_mlp": 0.17822266, + "step": 2447, + "time_per_iteration": 2.6736068725585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105721, + "balance_loss_mlp": 1.08789945, + "epoch": 0.470950365525202, + "flos": 564305273856.0, + "grad_norm": 0.08513203657143086, + "language_loss": 0.86345923, + "learning_rate": 0.0005711013511786027, + "loss": 0.87451649, + "num_input_tokens_seen": 204061904, + "router_z_loss_mlp": 0.1784668, + "step": 2448, + "time_per_iteration": 2.7899038791656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096309, + "balance_loss_mlp": 1.07914329, + "epoch": 0.47114274721046556, + "flos": 534450493440.0, + "grad_norm": 0.06769719009727464, + "language_loss": 0.83320636, + "learning_rate": 0.0005707929611172263, + "loss": 0.8441695, + "num_input_tokens_seen": 204137392, + "router_z_loss_mlp": 0.171875, + "step": 2449, + "time_per_iteration": 2.7302591800689697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094917, + "balance_loss_mlp": 1.07738137, + "epoch": 0.47133512889572915, + "flos": 473117303808.0, + "grad_norm": 0.0952592580133139, + "language_loss": 0.83792615, + 
"learning_rate": 0.000570484543571585, + "loss": 0.84887528, + "num_input_tokens_seen": 204202752, + "router_z_loss_mlp": 0.17553711, + "step": 2450, + "time_per_iteration": 2.553699254989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091615, + "balance_loss_mlp": 1.07405567, + "epoch": 0.4715275105809927, + "flos": 459013837824.0, + "grad_norm": 0.09253179962645706, + "language_loss": 0.82604945, + "learning_rate": 0.0005701760986614171, + "loss": 0.83696556, + "num_input_tokens_seen": 204266960, + "router_z_loss_mlp": 0.17578125, + "step": 2451, + "time_per_iteration": 2.5708320140838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084195, + "balance_loss_mlp": 1.06648016, + "epoch": 0.47171989226625627, + "flos": 422003437056.0, + "grad_norm": 0.09280751659958478, + "language_loss": 0.87434494, + "learning_rate": 0.0005698676265064714, + "loss": 0.88518691, + "num_input_tokens_seen": 204331216, + "router_z_loss_mlp": 0.17736816, + "step": 2452, + "time_per_iteration": 2.505521297454834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.06540704, + "epoch": 0.4719122739515198, + "flos": 457434998784.0, + "grad_norm": 0.08307061480415358, + "language_loss": 0.88798922, + "learning_rate": 0.0005695591272265074, + "loss": 0.89882344, + "num_input_tokens_seen": 204397216, + "router_z_loss_mlp": 0.18017578, + "step": 2453, + "time_per_iteration": 2.5634660720825195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091392, + "balance_loss_mlp": 1.07360613, + "epoch": 0.4721046556367834, + "flos": 514975449600.0, + "grad_norm": 0.09129518334944925, + "language_loss": 0.81819969, + "learning_rate": 0.0005692506009412954, + "loss": 0.8291136, + "num_input_tokens_seen": 204469952, + "router_z_loss_mlp": 0.17797852, + "step": 2454, + "time_per_iteration": 2.740715980529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094643, + 
"balance_loss_mlp": 1.08458209, + "epoch": 0.4722970373220469, + "flos": 1572258138624.0, + "grad_norm": 0.045004720534391626, + "language_loss": 0.7755127, + "learning_rate": 0.0005689420477706156, + "loss": 0.78645909, + "num_input_tokens_seen": 204701152, + "router_z_loss_mlp": 0.10058594, + "step": 2455, + "time_per_iteration": 4.978295803070068 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110787, + "balance_loss_mlp": 1.08972645, + "epoch": 0.4724894190073105, + "flos": 586214102016.0, + "grad_norm": 0.07943806135548723, + "language_loss": 0.89481127, + "learning_rate": 0.0005686334678342593, + "loss": 0.90588999, + "num_input_tokens_seen": 204778144, + "router_z_loss_mlp": 0.18151855, + "step": 2456, + "time_per_iteration": 2.9444401264190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124124, + "balance_loss_mlp": 1.10643291, + "epoch": 0.4726818006925741, + "flos": 867645789696.0, + "grad_norm": 0.08486852653668125, + "language_loss": 0.81272578, + "learning_rate": 0.0005683248612520274, + "loss": 0.8239671, + "num_input_tokens_seen": 204853376, + "router_z_loss_mlp": 0.17700195, + "step": 2457, + "time_per_iteration": 3.1061813831329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113744, + "balance_loss_mlp": 1.11931992, + "epoch": 0.4728741823778376, + "flos": 752967581184.0, + "grad_norm": 0.11516736159890015, + "language_loss": 0.83477956, + "learning_rate": 0.0005680162281437321, + "loss": 0.84615391, + "num_input_tokens_seen": 204925280, + "router_z_loss_mlp": 0.18115234, + "step": 2458, + "time_per_iteration": 2.929063558578491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148235, + "balance_loss_mlp": 1.13042545, + "epoch": 0.4730665640631012, + "flos": 538571773440.0, + "grad_norm": 0.07751254840004482, + "language_loss": 0.84309924, + "learning_rate": 0.000567707568629195, + "loss": 0.85458159, + "num_input_tokens_seen": 205000592, + "router_z_loss_mlp": 
0.17810059, + "step": 2459, + "time_per_iteration": 2.7221994400024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147022, + "balance_loss_mlp": 1.12910485, + "epoch": 0.47325894574836475, + "flos": 491653338624.0, + "grad_norm": 0.08725044616859287, + "language_loss": 0.81842762, + "learning_rate": 0.0005673988828282486, + "loss": 0.82989782, + "num_input_tokens_seen": 205073968, + "router_z_loss_mlp": 0.17932129, + "step": 2460, + "time_per_iteration": 2.7002882957458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137284, + "balance_loss_mlp": 1.11850882, + "epoch": 0.47345132743362833, + "flos": 764459223552.0, + "grad_norm": 0.08215342810100013, + "language_loss": 0.80515504, + "learning_rate": 0.0005670901708607352, + "loss": 0.8165279, + "num_input_tokens_seen": 205153536, + "router_z_loss_mlp": 0.1875, + "step": 2461, + "time_per_iteration": 2.9950685501098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118533, + "balance_loss_mlp": 1.09990108, + "epoch": 0.47364370911889186, + "flos": 540173007360.0, + "grad_norm": 0.10884730986404606, + "language_loss": 0.83628744, + "learning_rate": 0.0005667814328465076, + "loss": 0.84747279, + "num_input_tokens_seen": 205220944, + "router_z_loss_mlp": 0.1862793, + "step": 2462, + "time_per_iteration": 2.645465612411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108626, + "balance_loss_mlp": 1.09035087, + "epoch": 0.47383609080415545, + "flos": 406219815936.0, + "grad_norm": 0.09091581525952792, + "language_loss": 0.81654978, + "learning_rate": 0.0005664726689054285, + "loss": 0.82763606, + "num_input_tokens_seen": 205282688, + "router_z_loss_mlp": 0.18261719, + "step": 2463, + "time_per_iteration": 2.4545066356658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104041, + "balance_loss_mlp": 1.08579004, + "epoch": 0.474028472489419, + "flos": 453476703744.0, + "grad_norm": 0.07864824239143242, + 
"language_loss": 0.80990708, + "learning_rate": 0.0005661638791573704, + "loss": 0.82094747, + "num_input_tokens_seen": 205357360, + "router_z_loss_mlp": 0.18237305, + "step": 2464, + "time_per_iteration": 2.734745502471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108445, + "balance_loss_mlp": 1.08969331, + "epoch": 0.47422085417468257, + "flos": 492177171456.0, + "grad_norm": 0.0786760499807007, + "language_loss": 0.86728454, + "learning_rate": 0.0005658550637222164, + "loss": 0.87836903, + "num_input_tokens_seen": 205424352, + "router_z_loss_mlp": 0.1875, + "step": 2465, + "time_per_iteration": 2.6243197917938232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098109, + "balance_loss_mlp": 1.07942867, + "epoch": 0.47441323585994616, + "flos": 738854203392.0, + "grad_norm": 0.07656108123336647, + "language_loss": 0.82025492, + "learning_rate": 0.0005655462227198592, + "loss": 0.831236, + "num_input_tokens_seen": 205502912, + "router_z_loss_mlp": 0.18676758, + "step": 2466, + "time_per_iteration": 2.9340949058532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090686, + "balance_loss_mlp": 1.0713619, + "epoch": 0.4746056175452097, + "flos": 484685669376.0, + "grad_norm": 0.08929128939464244, + "language_loss": 0.84165299, + "learning_rate": 0.0005652373562702016, + "loss": 0.8525598, + "num_input_tokens_seen": 205571168, + "router_z_loss_mlp": 0.19311523, + "step": 2467, + "time_per_iteration": 2.6669704914093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088642, + "balance_loss_mlp": 1.07042646, + "epoch": 0.4747979992304733, + "flos": 461052269568.0, + "grad_norm": 0.09740211929478898, + "language_loss": 0.88243479, + "learning_rate": 0.000564928464493156, + "loss": 0.89332116, + "num_input_tokens_seen": 205639648, + "router_z_loss_mlp": 0.18212891, + "step": 2468, + "time_per_iteration": 2.5501999855041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01083596, + "balance_loss_mlp": 1.06571448, + "epoch": 0.4749903809157368, + "flos": 864431212032.0, + "grad_norm": 0.10206964777214489, + "language_loss": 0.8130033, + "learning_rate": 0.000564619547508645, + "loss": 0.82383919, + "num_input_tokens_seen": 205721536, + "router_z_loss_mlp": 0.17907715, + "step": 2469, + "time_per_iteration": 3.1110846996307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080615, + "balance_loss_mlp": 1.0618155, + "epoch": 0.4751827626010004, + "flos": 505546831872.0, + "grad_norm": 0.10847559686300064, + "language_loss": 0.83074248, + "learning_rate": 0.0005643106054366008, + "loss": 0.84154862, + "num_input_tokens_seen": 205788512, + "router_z_loss_mlp": 0.18798828, + "step": 2470, + "time_per_iteration": 2.5955324172973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082834, + "balance_loss_mlp": 1.0653584, + "epoch": 0.47537514428626393, + "flos": 559388519424.0, + "grad_norm": 0.07776069310312227, + "language_loss": 0.78943384, + "learning_rate": 0.000564001638396965, + "loss": 0.80026221, + "num_input_tokens_seen": 205863104, + "router_z_loss_mlp": 0.17492676, + "step": 2471, + "time_per_iteration": 2.7306296825408936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090328, + "balance_loss_mlp": 1.07253027, + "epoch": 0.4755675259715275, + "flos": 834260000256.0, + "grad_norm": 0.0797482134953605, + "language_loss": 0.81547666, + "learning_rate": 0.0005636926465096897, + "loss": 0.8263799, + "num_input_tokens_seen": 205940688, + "router_z_loss_mlp": 0.17810059, + "step": 2472, + "time_per_iteration": 3.059279203414917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112, + "balance_loss_mlp": 1.09371316, + "epoch": 0.47575990765679105, + "flos": 508237576704.0, + "grad_norm": 0.08460495515925144, + "language_loss": 0.87285447, + "learning_rate": 0.0005633836298947363, + "loss": 0.88397449, + "num_input_tokens_seen": 206008352, + 
"router_z_loss_mlp": 0.18286133, + "step": 2473, + "time_per_iteration": 2.6521553993225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122658, + "balance_loss_mlp": 1.10413289, + "epoch": 0.47595228934205464, + "flos": 591845211648.0, + "grad_norm": 0.09203669339342216, + "language_loss": 0.70590854, + "learning_rate": 0.000563074588672075, + "loss": 0.71713507, + "num_input_tokens_seen": 206078240, + "router_z_loss_mlp": 0.18530273, + "step": 2474, + "time_per_iteration": 2.7375221252441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113642, + "balance_loss_mlp": 1.11839581, + "epoch": 0.4761446710273182, + "flos": 580607958528.0, + "grad_norm": 0.0857314817059495, + "language_loss": 0.8500272, + "learning_rate": 0.0005627655229616868, + "loss": 0.86139143, + "num_input_tokens_seen": 206148896, + "router_z_loss_mlp": 0.18029785, + "step": 2475, + "time_per_iteration": 2.7078299522399902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128625, + "balance_loss_mlp": 1.11030293, + "epoch": 0.47633705271258175, + "flos": 672893153280.0, + "grad_norm": 0.07963853645873449, + "language_loss": 0.89927155, + "learning_rate": 0.0005624564328835616, + "loss": 0.91055775, + "num_input_tokens_seen": 206223792, + "router_z_loss_mlp": 0.18334961, + "step": 2476, + "time_per_iteration": 2.8388264179229736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117766, + "balance_loss_mlp": 1.09916914, + "epoch": 0.47652943439784534, + "flos": 541857931776.0, + "grad_norm": 0.07471116365669703, + "language_loss": 0.83945388, + "learning_rate": 0.0005621473185576986, + "loss": 0.85063154, + "num_input_tokens_seen": 206299376, + "router_z_loss_mlp": 0.18579102, + "step": 2477, + "time_per_iteration": 2.7755634784698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112627, + "balance_loss_mlp": 1.09451878, + "epoch": 0.4767218160831089, + "flos": 524819243520.0, + "grad_norm": 
0.10765434361010802, + "language_loss": 0.87517297, + "learning_rate": 0.0005618381801041068, + "loss": 0.88629925, + "num_input_tokens_seen": 206367936, + "router_z_loss_mlp": 0.18115234, + "step": 2478, + "time_per_iteration": 2.6078171730041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110924, + "balance_loss_mlp": 1.0912751, + "epoch": 0.47691419776837246, + "flos": 568056167424.0, + "grad_norm": 0.09054531696498577, + "language_loss": 0.8286736, + "learning_rate": 0.0005615290176428044, + "loss": 0.83976603, + "num_input_tokens_seen": 206438864, + "router_z_loss_mlp": 0.17980957, + "step": 2479, + "time_per_iteration": 2.658313035964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093873, + "balance_loss_mlp": 1.07611132, + "epoch": 0.477106579453636, + "flos": 530931967488.0, + "grad_norm": 0.07218164617984826, + "language_loss": 0.85039639, + "learning_rate": 0.0005612198312938187, + "loss": 0.8613351, + "num_input_tokens_seen": 206516656, + "router_z_loss_mlp": 0.17773438, + "step": 2480, + "time_per_iteration": 2.7423031330108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_mlp": 1.07839966, + "epoch": 0.4772989611388996, + "flos": 594283765248.0, + "grad_norm": 0.08183869789897112, + "language_loss": 0.79371572, + "learning_rate": 0.0005609106211771868, + "loss": 0.80467397, + "num_input_tokens_seen": 206595040, + "router_z_loss_mlp": 0.17443848, + "step": 2481, + "time_per_iteration": 2.888284921646118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098997, + "balance_loss_mlp": 1.08134174, + "epoch": 0.4774913428241631, + "flos": 544622828544.0, + "grad_norm": 0.07799032438633784, + "language_loss": 0.89138782, + "learning_rate": 0.0005606013874129543, + "loss": 0.90237772, + "num_input_tokens_seen": 206670192, + "router_z_loss_mlp": 0.17675781, + "step": 2482, + "time_per_iteration": 2.8308520317077637 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01096101, + "balance_loss_mlp": 1.07892263, + "epoch": 0.4776837245094267, + "flos": 540079031808.0, + "grad_norm": 0.06912495328146803, + "language_loss": 0.79914749, + "learning_rate": 0.0005602921301211768, + "loss": 0.81010854, + "num_input_tokens_seen": 206746992, + "router_z_loss_mlp": 0.17199707, + "step": 2483, + "time_per_iteration": 2.745229721069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092062, + "balance_loss_mlp": 1.07441866, + "epoch": 0.4778761061946903, + "flos": 471785513472.0, + "grad_norm": 0.08947954354315603, + "language_loss": 0.8218801, + "learning_rate": 0.0005599828494219185, + "loss": 0.83280063, + "num_input_tokens_seen": 206813584, + "router_z_loss_mlp": 0.1763916, + "step": 2484, + "time_per_iteration": 2.5549302101135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096763, + "balance_loss_mlp": 1.07945359, + "epoch": 0.4780684878799538, + "flos": 726082527744.0, + "grad_norm": 0.09532235552733567, + "language_loss": 0.8879438, + "learning_rate": 0.0005596735454352527, + "loss": 0.89891142, + "num_input_tokens_seen": 206885840, + "router_z_loss_mlp": 0.17333984, + "step": 2485, + "time_per_iteration": 2.8665127754211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094881, + "balance_loss_mlp": 1.07777441, + "epoch": 0.4782608695652174, + "flos": 548922147840.0, + "grad_norm": 0.09434748219243295, + "language_loss": 0.85316986, + "learning_rate": 0.0005593642182812619, + "loss": 0.8641187, + "num_input_tokens_seen": 206955104, + "router_z_loss_mlp": 0.17126465, + "step": 2486, + "time_per_iteration": 2.6778790950775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094993, + "balance_loss_mlp": 1.07798147, + "epoch": 0.47845325125048094, + "flos": 829923604992.0, + "grad_norm": 0.07207308279854807, + "language_loss": 0.83091319, + "learning_rate": 0.0005590548680800378, + "loss": 0.84186316, + "num_input_tokens_seen": 
207039792, + "router_z_loss_mlp": 0.17028809, + "step": 2487, + "time_per_iteration": 3.121678590774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100078, + "balance_loss_mlp": 1.08330488, + "epoch": 0.4786456329357445, + "flos": 514164920832.0, + "grad_norm": 0.0688175569320757, + "language_loss": 0.76333058, + "learning_rate": 0.0005587454949516804, + "loss": 0.77433127, + "num_input_tokens_seen": 207115632, + "router_z_loss_mlp": 0.16784668, + "step": 2488, + "time_per_iteration": 2.7487144470214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109664, + "balance_loss_mlp": 1.09223557, + "epoch": 0.47883801462100806, + "flos": 564658781184.0, + "grad_norm": 0.0791895688664035, + "language_loss": 0.87661278, + "learning_rate": 0.0005584360990162993, + "loss": 0.88770944, + "num_input_tokens_seen": 207184336, + "router_z_loss_mlp": 0.17443848, + "step": 2489, + "time_per_iteration": 2.6889615058898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.08878708, + "epoch": 0.47903039630627164, + "flos": 579577545216.0, + "grad_norm": 0.06381910852284944, + "language_loss": 0.85160542, + "learning_rate": 0.0005581266803940124, + "loss": 0.8626619, + "num_input_tokens_seen": 207258720, + "router_z_loss_mlp": 0.16870117, + "step": 2490, + "time_per_iteration": 2.752704381942749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108064, + "balance_loss_mlp": 1.09077895, + "epoch": 0.47922277799153523, + "flos": 618950149632.0, + "grad_norm": 0.06997425176776657, + "language_loss": 0.87046134, + "learning_rate": 0.0005578172392049471, + "loss": 0.88154197, + "num_input_tokens_seen": 207329216, + "router_z_loss_mlp": 0.17297363, + "step": 2491, + "time_per_iteration": 2.744326114654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113808, + "balance_loss_mlp": 1.09704673, + "epoch": 0.47941515967679876, + "flos": 639653096448.0, + "grad_norm": 
0.0919919864780235, + "language_loss": 0.84245729, + "learning_rate": 0.0005575077755692386, + "loss": 0.85359544, + "num_input_tokens_seen": 207403712, + "router_z_loss_mlp": 0.16760254, + "step": 2492, + "time_per_iteration": 2.829349994659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106621, + "balance_loss_mlp": 1.08978891, + "epoch": 0.47960754136206235, + "flos": 519823194624.0, + "grad_norm": 0.07193820952165939, + "language_loss": 0.85866803, + "learning_rate": 0.0005571982896070316, + "loss": 0.86973423, + "num_input_tokens_seen": 207477120, + "router_z_loss_mlp": 0.16845703, + "step": 2493, + "time_per_iteration": 2.6917920112609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111679, + "balance_loss_mlp": 1.09457207, + "epoch": 0.4797999230473259, + "flos": 475044507648.0, + "grad_norm": 0.08033850408937983, + "language_loss": 0.89604986, + "learning_rate": 0.0005568887814384792, + "loss": 0.9071666, + "num_input_tokens_seen": 207544592, + "router_z_loss_mlp": 0.17114258, + "step": 2494, + "time_per_iteration": 2.569196939468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106396, + "balance_loss_mlp": 1.08963561, + "epoch": 0.47999230473258947, + "flos": 532026620928.0, + "grad_norm": 0.07662616215624289, + "language_loss": 0.87274265, + "learning_rate": 0.000556579251183743, + "loss": 0.88380659, + "num_input_tokens_seen": 207613808, + "router_z_loss_mlp": 0.16772461, + "step": 2495, + "time_per_iteration": 4.119016408920288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109667, + "balance_loss_mlp": 1.09276271, + "epoch": 0.480184686417853, + "flos": 601486373376.0, + "grad_norm": 0.07795098880988466, + "language_loss": 0.79870969, + "learning_rate": 0.0005562696989629936, + "loss": 0.80980641, + "num_input_tokens_seen": 207684464, + "router_z_loss_mlp": 0.16918945, + "step": 2496, + "time_per_iteration": 2.780027151107788 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01112455, + "balance_loss_mlp": 1.09557533, + "epoch": 0.4803770681031166, + "flos": 528196806144.0, + "grad_norm": 0.068284016634177, + "language_loss": 0.82789242, + "learning_rate": 0.0005559601248964095, + "loss": 0.83901697, + "num_input_tokens_seen": 207754016, + "router_z_loss_mlp": 0.16894531, + "step": 2497, + "time_per_iteration": 2.653590202331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110865, + "balance_loss_mlp": 1.09190154, + "epoch": 0.4805694497883801, + "flos": 511192622592.0, + "grad_norm": 0.10697304585744172, + "language_loss": 0.85506153, + "learning_rate": 0.0005556505291041783, + "loss": 0.86614799, + "num_input_tokens_seen": 207827104, + "router_z_loss_mlp": 0.16760254, + "step": 2498, + "time_per_iteration": 2.720294952392578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106549, + "balance_loss_mlp": 1.08972836, + "epoch": 0.4807618314736437, + "flos": 600342160896.0, + "grad_norm": 0.0621998173583794, + "language_loss": 0.84237647, + "learning_rate": 0.0005553409117064954, + "loss": 0.85344195, + "num_input_tokens_seen": 207907824, + "router_z_loss_mlp": 0.16833496, + "step": 2499, + "time_per_iteration": 2.9154043197631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119945, + "balance_loss_mlp": 1.10298109, + "epoch": 0.4809542131589073, + "flos": 568965441024.0, + "grad_norm": 0.07282479458874046, + "language_loss": 0.84656966, + "learning_rate": 0.0005550312728235654, + "loss": 0.85776907, + "num_input_tokens_seen": 207975632, + "router_z_loss_mlp": 0.16967773, + "step": 2500, + "time_per_iteration": 2.700421094894409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110812, + "balance_loss_mlp": 1.09159744, + "epoch": 0.4811465948441708, + "flos": 575994779136.0, + "grad_norm": 0.08404220746537734, + "language_loss": 0.83821297, + "learning_rate": 0.0005547216125756003, + "loss": 0.84929419, + "num_input_tokens_seen": 
208048000, + "router_z_loss_mlp": 0.1652832, + "step": 2501, + "time_per_iteration": 2.7834067344665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106276, + "balance_loss_mlp": 1.08955085, + "epoch": 0.4813389765294344, + "flos": 823865209344.0, + "grad_norm": 0.07639679647694927, + "language_loss": 0.81906044, + "learning_rate": 0.0005544119310828211, + "loss": 0.83012319, + "num_input_tokens_seen": 208132592, + "router_z_loss_mlp": 0.16723633, + "step": 2502, + "time_per_iteration": 3.116422414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_mlp": 1.09020913, + "epoch": 0.48153135821469795, + "flos": 635531816448.0, + "grad_norm": 0.07431223188319182, + "language_loss": 0.84573793, + "learning_rate": 0.0005541022284654568, + "loss": 0.85680836, + "num_input_tokens_seen": 208215824, + "router_z_loss_mlp": 0.16845703, + "step": 2503, + "time_per_iteration": 2.9265871047973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110287, + "balance_loss_mlp": 1.08615696, + "epoch": 0.48172373989996153, + "flos": 503701120512.0, + "grad_norm": 0.06355297884535237, + "language_loss": 0.83910048, + "learning_rate": 0.0005537925048437446, + "loss": 0.85012925, + "num_input_tokens_seen": 208284304, + "router_z_loss_mlp": 0.16723633, + "step": 2504, + "time_per_iteration": 2.6517508029937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087379, + "balance_loss_mlp": 1.07774711, + "epoch": 0.48191612158522507, + "flos": 1532362074624.0, + "grad_norm": 0.041815183909307344, + "language_loss": 0.75751472, + "learning_rate": 0.00055348276033793, + "loss": 0.76838851, + "num_input_tokens_seen": 208510224, + "router_z_loss_mlp": 0.09619141, + "step": 2505, + "time_per_iteration": 4.958322048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105183, + "balance_loss_mlp": 1.08836293, + "epoch": 0.48210850327048865, + "flos": 702424161792.0, + 
"grad_norm": 0.060666396845578126, + "language_loss": 0.88195586, + "learning_rate": 0.0005531729950682664, + "loss": 0.8930077, + "num_input_tokens_seen": 208596816, + "router_z_loss_mlp": 0.16833496, + "step": 2506, + "time_per_iteration": 3.0288734436035156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103671, + "balance_loss_mlp": 1.08631384, + "epoch": 0.4823008849557522, + "flos": 439778502144.0, + "grad_norm": 0.10090208417938805, + "language_loss": 0.84562349, + "learning_rate": 0.000552863209155015, + "loss": 0.85666019, + "num_input_tokens_seen": 208659616, + "router_z_loss_mlp": 0.17382812, + "step": 2507, + "time_per_iteration": 2.503030300140381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104399, + "balance_loss_mlp": 1.0873642, + "epoch": 0.48249326664101577, + "flos": 471859665408.0, + "grad_norm": 0.0644343170841742, + "language_loss": 0.82010555, + "learning_rate": 0.0005525534027184461, + "loss": 0.83114958, + "num_input_tokens_seen": 208728080, + "router_z_loss_mlp": 0.17053223, + "step": 2508, + "time_per_iteration": 2.563375949859619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115093, + "balance_loss_mlp": 1.09834397, + "epoch": 0.48268564832627936, + "flos": 563225674752.0, + "grad_norm": 0.20306769309253048, + "language_loss": 0.82742786, + "learning_rate": 0.0005522435758788365, + "loss": 0.83857882, + "num_input_tokens_seen": 208803376, + "router_z_loss_mlp": 0.16760254, + "step": 2509, + "time_per_iteration": 2.773317813873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107185, + "balance_loss_mlp": 1.08974481, + "epoch": 0.4828780300115429, + "flos": 629606670336.0, + "grad_norm": 0.08084829795782655, + "language_loss": 0.80297685, + "learning_rate": 0.0005519337287564721, + "loss": 0.81404877, + "num_input_tokens_seen": 208876656, + "router_z_loss_mlp": 0.17468262, + "step": 2510, + "time_per_iteration": 2.8417367935180664 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109893, + "balance_loss_mlp": 1.09273911, + "epoch": 0.4830704116968065, + "flos": 631850305536.0, + "grad_norm": 0.07005467856459312, + "language_loss": 0.83318454, + "learning_rate": 0.000551623861471646, + "loss": 0.84428346, + "num_input_tokens_seen": 208950224, + "router_z_loss_mlp": 0.17175293, + "step": 2511, + "time_per_iteration": 4.144210577011108 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031589, + "balance_loss_mlp": 1.02186131, + "epoch": 0.48326279338207, + "flos": 1569268588032.0, + "grad_norm": 0.022823457387693702, + "language_loss": 0.78818834, + "learning_rate": 0.0005513139741446594, + "loss": 0.79850423, + "num_input_tokens_seen": 209173984, + "router_z_loss_mlp": 0.09716797, + "step": 2512, + "time_per_iteration": 4.846112489700317 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105484, + "balance_loss_mlp": 1.08805561, + "epoch": 0.4834551750673336, + "flos": 509238254592.0, + "grad_norm": 0.06582055063949785, + "language_loss": 0.86307418, + "learning_rate": 0.0005510040668958211, + "loss": 0.87412906, + "num_input_tokens_seen": 209242832, + "router_z_loss_mlp": 0.17443848, + "step": 2513, + "time_per_iteration": 2.5893678665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027944, + "balance_loss_mlp": 1.01802599, + "epoch": 0.48364755675259713, + "flos": 1528663311360.0, + "grad_norm": 0.018178820637651416, + "language_loss": 0.77760583, + "learning_rate": 0.0005506941398454483, + "loss": 0.78788525, + "num_input_tokens_seen": 209473520, + "router_z_loss_mlp": 0.09912109, + "step": 2514, + "time_per_iteration": 4.883544445037842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104231, + "balance_loss_mlp": 1.08638501, + "epoch": 0.4838399384378607, + "flos": 564989893632.0, + "grad_norm": 0.07451301520475437, + "language_loss": 0.83174801, + "learning_rate": 0.0005503841931138645, + "loss": 
0.84279031, + "num_input_tokens_seen": 209544208, + "router_z_loss_mlp": 0.17858887, + "step": 2515, + "time_per_iteration": 2.6821184158325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099055, + "balance_loss_mlp": 1.0817579, + "epoch": 0.4840323201231243, + "flos": 387691121664.0, + "grad_norm": 0.1026377711865236, + "language_loss": 0.81650221, + "learning_rate": 0.0005500742268214025, + "loss": 0.82749277, + "num_input_tokens_seen": 209607408, + "router_z_loss_mlp": 0.17321777, + "step": 2516, + "time_per_iteration": 2.501392364501953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094696, + "balance_loss_mlp": 1.07677877, + "epoch": 0.48422470180838784, + "flos": 630995360256.0, + "grad_norm": 0.06104395933883966, + "language_loss": 0.85527956, + "learning_rate": 0.0005497642410884014, + "loss": 0.86622655, + "num_input_tokens_seen": 209683392, + "router_z_loss_mlp": 0.17919922, + "step": 2517, + "time_per_iteration": 2.7879879474639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092849, + "balance_loss_mlp": 1.07494426, + "epoch": 0.4844170834936514, + "flos": 499226333184.0, + "grad_norm": 0.0763804859448823, + "language_loss": 0.85418707, + "learning_rate": 0.0005494542360352085, + "loss": 0.86511558, + "num_input_tokens_seen": 209753184, + "router_z_loss_mlp": 0.17919922, + "step": 2518, + "time_per_iteration": 2.705934762954712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098599, + "balance_loss_mlp": 1.0811708, + "epoch": 0.48460946517891496, + "flos": 551076576768.0, + "grad_norm": 0.07348525281964927, + "language_loss": 0.855097, + "learning_rate": 0.0005491442117821783, + "loss": 0.86608291, + "num_input_tokens_seen": 209829568, + "router_z_loss_mlp": 0.17456055, + "step": 2519, + "time_per_iteration": 2.7056097984313965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097149, + "balance_loss_mlp": 1.07910061, + "epoch": 0.48480184686417854, + 
"flos": 529390204416.0, + "grad_norm": 0.07963371062569355, + "language_loss": 0.87741303, + "learning_rate": 0.0005488341684496732, + "loss": 0.88838446, + "num_input_tokens_seen": 209902176, + "router_z_loss_mlp": 0.18054199, + "step": 2520, + "time_per_iteration": 2.6991913318634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107269, + "balance_loss_mlp": 1.08979297, + "epoch": 0.4849942285494421, + "flos": 531912821760.0, + "grad_norm": 0.06522694836378315, + "language_loss": 0.91749704, + "learning_rate": 0.0005485241061580624, + "loss": 0.92856967, + "num_input_tokens_seen": 209969168, + "router_z_loss_mlp": 0.17480469, + "step": 2521, + "time_per_iteration": 2.751336097717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111149, + "balance_loss_mlp": 1.09335089, + "epoch": 0.48518661023470566, + "flos": 722578682880.0, + "grad_norm": 0.0788581364531382, + "language_loss": 0.84810591, + "learning_rate": 0.0005482140250277228, + "loss": 0.85921741, + "num_input_tokens_seen": 210049616, + "router_z_loss_mlp": 0.17797852, + "step": 2522, + "time_per_iteration": 3.012603759765625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116154, + "balance_loss_mlp": 1.09896421, + "epoch": 0.4853789919199692, + "flos": 506105169408.0, + "grad_norm": 0.081531881919659, + "language_loss": 0.87781787, + "learning_rate": 0.0005479039251790387, + "loss": 0.88897943, + "num_input_tokens_seen": 210118512, + "router_z_loss_mlp": 0.17211914, + "step": 2523, + "time_per_iteration": 2.6643292903900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115817, + "balance_loss_mlp": 1.0985198, + "epoch": 0.4855713736052328, + "flos": 660487094784.0, + "grad_norm": 0.1008566510750689, + "language_loss": 0.84847081, + "learning_rate": 0.0005475938067324014, + "loss": 0.85962898, + "num_input_tokens_seen": 210193728, + "router_z_loss_mlp": 0.1730957, + "step": 2524, + "time_per_iteration": 2.8631820678710938 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129536, + "balance_loss_mlp": 1.11252499, + "epoch": 0.48576375529049637, + "flos": 436959277056.0, + "grad_norm": 0.08592622698203999, + "language_loss": 0.83456719, + "learning_rate": 0.0005472836698082098, + "loss": 0.84586251, + "num_input_tokens_seen": 210258832, + "router_z_loss_mlp": 0.17028809, + "step": 2525, + "time_per_iteration": 2.5364460945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109588, + "balance_loss_mlp": 1.09244525, + "epoch": 0.4859561369757599, + "flos": 581707381248.0, + "grad_norm": 0.06952957834620052, + "language_loss": 0.8412683, + "learning_rate": 0.0005469735145268694, + "loss": 0.85236418, + "num_input_tokens_seen": 210335280, + "router_z_loss_mlp": 0.17138672, + "step": 2526, + "time_per_iteration": 2.766571283340454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106022, + "balance_loss_mlp": 1.08884394, + "epoch": 0.4861485186610235, + "flos": 487964487168.0, + "grad_norm": 0.07975413334667165, + "language_loss": 0.80809188, + "learning_rate": 0.0005466633410087933, + "loss": 0.81915212, + "num_input_tokens_seen": 210407072, + "router_z_loss_mlp": 0.171875, + "step": 2527, + "time_per_iteration": 2.738344192504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072106, + "balance_loss_mlp": 1.06094766, + "epoch": 0.486340900346287, + "flos": 1557734727168.0, + "grad_norm": 0.03644390169401177, + "language_loss": 0.77260822, + "learning_rate": 0.0005463531493744017, + "loss": 0.78332925, + "num_input_tokens_seen": 210644544, + "router_z_loss_mlp": 0.11181641, + "step": 2528, + "time_per_iteration": 4.871282339096069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090172, + "balance_loss_mlp": 1.07268429, + "epoch": 0.4865332820315506, + "flos": 483005514240.0, + "grad_norm": 0.06987485087243678, + "language_loss": 0.87962806, + "learning_rate": 0.0005460429397441214, + "loss": 
0.89052981, + "num_input_tokens_seen": 210711760, + "router_z_loss_mlp": 0.1751709, + "step": 2529, + "time_per_iteration": 2.589794635772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097683, + "balance_loss_mlp": 1.08112478, + "epoch": 0.48672566371681414, + "flos": 535809447936.0, + "grad_norm": 0.08125917870845005, + "language_loss": 0.86507833, + "learning_rate": 0.0005457327122383866, + "loss": 0.87605512, + "num_input_tokens_seen": 210783040, + "router_z_loss_mlp": 0.16564941, + "step": 2530, + "time_per_iteration": 2.633769989013672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024086, + "balance_loss_mlp": 1.01402473, + "epoch": 0.4869180454020777, + "flos": 1412665422336.0, + "grad_norm": 0.019350247330642424, + "language_loss": 0.74636483, + "learning_rate": 0.0005454224669776385, + "loss": 0.75660574, + "num_input_tokens_seen": 211002128, + "router_z_loss_mlp": 0.10058594, + "step": 2531, + "time_per_iteration": 4.829160213470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111265, + "balance_loss_mlp": 1.09450376, + "epoch": 0.48711042708734126, + "flos": 573113885184.0, + "grad_norm": 0.07679109022151961, + "language_loss": 0.7589134, + "learning_rate": 0.0005451122040823244, + "loss": 0.77002603, + "num_input_tokens_seen": 211080080, + "router_z_loss_mlp": 0.16760254, + "step": 2532, + "time_per_iteration": 2.809295654296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113381, + "balance_loss_mlp": 1.09582114, + "epoch": 0.48730280877260485, + "flos": 626547737088.0, + "grad_norm": 0.07652021477742418, + "language_loss": 0.76977062, + "learning_rate": 0.0005448019236728997, + "loss": 0.78090441, + "num_input_tokens_seen": 211162944, + "router_z_loss_mlp": 0.17565918, + "step": 2533, + "time_per_iteration": 2.889730930328369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111676, + "balance_loss_mlp": 1.09540379, + "epoch": 0.48749519045786843, 
+ "flos": 512479996416.0, + "grad_norm": 0.08912362185496442, + "language_loss": 0.84908152, + "learning_rate": 0.0005444916258698255, + "loss": 0.86019826, + "num_input_tokens_seen": 211230448, + "router_z_loss_mlp": 0.16271973, + "step": 2534, + "time_per_iteration": 2.6680796146392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109562, + "balance_loss_mlp": 1.09297991, + "epoch": 0.48768757214313196, + "flos": 525414657024.0, + "grad_norm": 0.06587099405348051, + "language_loss": 0.85898745, + "learning_rate": 0.0005441813107935704, + "loss": 0.87008309, + "num_input_tokens_seen": 211301248, + "router_z_loss_mlp": 0.16589355, + "step": 2535, + "time_per_iteration": 2.708963394165039 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121617, + "balance_loss_mlp": 1.10494018, + "epoch": 0.48787995382839555, + "flos": 505032910848.0, + "grad_norm": 0.07506618076199813, + "language_loss": 0.856264, + "learning_rate": 0.0005438709785646091, + "loss": 0.86748016, + "num_input_tokens_seen": 211369888, + "router_z_loss_mlp": 0.16687012, + "step": 2536, + "time_per_iteration": 2.5794246196746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111352, + "balance_loss_mlp": 1.0970813, + "epoch": 0.4880723355136591, + "flos": 575172140544.0, + "grad_norm": 0.06872348733444625, + "language_loss": 0.86540043, + "learning_rate": 0.0005435606293034234, + "loss": 0.87653565, + "num_input_tokens_seen": 211441808, + "router_z_loss_mlp": 0.16442871, + "step": 2537, + "time_per_iteration": 2.663050889968872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116159, + "balance_loss_mlp": 1.0999465, + "epoch": 0.48826471719892267, + "flos": 561444203520.0, + "grad_norm": 0.09164692396838796, + "language_loss": 0.84696114, + "learning_rate": 0.0005432502631305016, + "loss": 0.85812277, + "num_input_tokens_seen": 211511216, + "router_z_loss_mlp": 0.16210938, + "step": 2538, + "time_per_iteration": 
2.7034809589385986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119353, + "balance_loss_mlp": 1.10295033, + "epoch": 0.4884570988841862, + "flos": 726188613120.0, + "grad_norm": 0.06227186407680876, + "language_loss": 0.82968855, + "learning_rate": 0.0005429398801663386, + "loss": 0.84088206, + "num_input_tokens_seen": 211589264, + "router_z_loss_mlp": 0.1640625, + "step": 2539, + "time_per_iteration": 3.0155930519104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120209, + "balance_loss_mlp": 1.10398471, + "epoch": 0.4886494805694498, + "flos": 431019449856.0, + "grad_norm": 0.10714048411465311, + "language_loss": 0.82757926, + "learning_rate": 0.0005426294805314355, + "loss": 0.83878136, + "num_input_tokens_seen": 211652928, + "router_z_loss_mlp": 0.16223145, + "step": 2540, + "time_per_iteration": 2.5441384315490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115337, + "balance_loss_mlp": 1.09914827, + "epoch": 0.4888418622547134, + "flos": 673006579200.0, + "grad_norm": 0.08648554978838247, + "language_loss": 0.79954243, + "learning_rate": 0.0005423190643463003, + "loss": 0.81069577, + "num_input_tokens_seen": 211741664, + "router_z_loss_mlp": 0.16186523, + "step": 2541, + "time_per_iteration": 2.992694854736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112211, + "balance_loss_mlp": 1.0954504, + "epoch": 0.4890342439399769, + "flos": 541897579008.0, + "grad_norm": 0.08541624697499144, + "language_loss": 0.82913029, + "learning_rate": 0.0005420086317314473, + "loss": 0.84025246, + "num_input_tokens_seen": 211809136, + "router_z_loss_mlp": 0.16772461, + "step": 2542, + "time_per_iteration": 2.658069133758545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104299, + "balance_loss_mlp": 1.08720386, + "epoch": 0.4892266256252405, + "flos": 590676406272.0, + "grad_norm": 0.06935244738816776, + "language_loss": 0.80814946, + "learning_rate": 
0.0005416981828073971, + "loss": 0.81919247, + "num_input_tokens_seen": 211883136, + "router_z_loss_mlp": 0.17102051, + "step": 2543, + "time_per_iteration": 2.818812608718872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_mlp": 1.02991831, + "epoch": 0.48941900731050403, + "flos": 1516296526848.0, + "grad_norm": 0.020152649211275964, + "language_loss": 0.77115011, + "learning_rate": 0.0005413877176946765, + "loss": 0.78154421, + "num_input_tokens_seen": 212117488, + "router_z_loss_mlp": 0.09472656, + "step": 2544, + "time_per_iteration": 4.891278028488159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100982, + "balance_loss_mlp": 1.08363652, + "epoch": 0.4896113889957676, + "flos": 470564951040.0, + "grad_norm": 0.07927159683050183, + "language_loss": 0.85168952, + "learning_rate": 0.000541077236513819, + "loss": 0.86269933, + "num_input_tokens_seen": 212181952, + "router_z_loss_mlp": 0.17346191, + "step": 2545, + "time_per_iteration": 2.589184045791626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094215, + "balance_loss_mlp": 1.07689393, + "epoch": 0.48980377068103115, + "flos": 496557983232.0, + "grad_norm": 0.06748793045052295, + "language_loss": 0.82038838, + "learning_rate": 0.0005407667393853638, + "loss": 0.83133048, + "num_input_tokens_seen": 212252608, + "router_z_loss_mlp": 0.17333984, + "step": 2546, + "time_per_iteration": 2.6306400299072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099707, + "balance_loss_mlp": 1.08196878, + "epoch": 0.48999615236629473, + "flos": 692852382720.0, + "grad_norm": 0.08073962926855084, + "language_loss": 0.83248717, + "learning_rate": 0.0005404562264298569, + "loss": 0.84348422, + "num_input_tokens_seen": 212328560, + "router_z_loss_mlp": 0.17749023, + "step": 2547, + "time_per_iteration": 2.890744209289551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097618, + "balance_loss_mlp": 
1.0795579, + "epoch": 0.49018853405155827, + "flos": 541694946816.0, + "grad_norm": 0.07477586030938296, + "language_loss": 0.83869213, + "learning_rate": 0.0005401456977678498, + "loss": 0.84966832, + "num_input_tokens_seen": 212399616, + "router_z_loss_mlp": 0.18078613, + "step": 2548, + "time_per_iteration": 2.691488027572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093416, + "balance_loss_mlp": 1.0753082, + "epoch": 0.49038091573682185, + "flos": 695663894016.0, + "grad_norm": 0.08381067722766777, + "language_loss": 0.77390134, + "learning_rate": 0.0005398351535199008, + "loss": 0.78483546, + "num_input_tokens_seen": 212482352, + "router_z_loss_mlp": 0.18103027, + "step": 2549, + "time_per_iteration": 3.0651490688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087007, + "balance_loss_mlp": 1.06931591, + "epoch": 0.49057329742208544, + "flos": 596902929408.0, + "grad_norm": 0.05957811074119609, + "language_loss": 0.83473563, + "learning_rate": 0.0005395245938065735, + "loss": 0.84560567, + "num_input_tokens_seen": 212559504, + "router_z_loss_mlp": 0.17712402, + "step": 2550, + "time_per_iteration": 2.7947916984558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085104, + "balance_loss_mlp": 1.06648386, + "epoch": 0.490765679107349, + "flos": 513406522368.0, + "grad_norm": 0.10016911025461137, + "language_loss": 0.82528293, + "learning_rate": 0.0005392140187484379, + "loss": 0.83613402, + "num_input_tokens_seen": 212625664, + "router_z_loss_mlp": 0.18603516, + "step": 2551, + "time_per_iteration": 2.6254496574401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089241, + "balance_loss_mlp": 1.0698818, + "epoch": 0.49095806079261256, + "flos": 629606670336.0, + "grad_norm": 0.05979290752357133, + "language_loss": 0.89496678, + "learning_rate": 0.0005389034284660701, + "loss": 0.90585923, + "num_input_tokens_seen": 212702000, + "router_z_loss_mlp": 0.19348145, + "step": 
2552, + "time_per_iteration": 2.8202950954437256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096651, + "balance_loss_mlp": 1.07798314, + "epoch": 0.4911504424778761, + "flos": 915307941888.0, + "grad_norm": 0.09877873271676557, + "language_loss": 0.82097638, + "learning_rate": 0.000538592823080052, + "loss": 0.83194292, + "num_input_tokens_seen": 212785376, + "router_z_loss_mlp": 0.18676758, + "step": 2553, + "time_per_iteration": 3.156975507736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092631, + "balance_loss_mlp": 1.07395101, + "epoch": 0.4913428241631397, + "flos": 438943380480.0, + "grad_norm": 0.1092160541841064, + "language_loss": 0.84523845, + "learning_rate": 0.000538282202710971, + "loss": 0.85616469, + "num_input_tokens_seen": 212848176, + "router_z_loss_mlp": 0.18664551, + "step": 2554, + "time_per_iteration": 2.5290331840515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109959, + "balance_loss_mlp": 1.08045673, + "epoch": 0.4915352058484032, + "flos": 636092725248.0, + "grad_norm": 0.10555847882945492, + "language_loss": 0.82219321, + "learning_rate": 0.000537971567479421, + "loss": 0.83318907, + "num_input_tokens_seen": 212917888, + "router_z_loss_mlp": 0.19128418, + "step": 2555, + "time_per_iteration": 2.755554437637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094808, + "balance_loss_mlp": 1.07547224, + "epoch": 0.4917275875336668, + "flos": 504518989824.0, + "grad_norm": 0.0816634604134734, + "language_loss": 0.87386465, + "learning_rate": 0.0005376609175060011, + "loss": 0.88481277, + "num_input_tokens_seen": 212986288, + "router_z_loss_mlp": 0.19311523, + "step": 2556, + "time_per_iteration": 2.6251890659332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088346, + "balance_loss_mlp": 1.06941605, + "epoch": 0.49191996921893033, + "flos": 654547267584.0, + "grad_norm": 0.1007754916439506, + "language_loss": 0.80408537, + 
"learning_rate": 0.0005373502529113162, + "loss": 0.81496882, + "num_input_tokens_seen": 213059504, + "router_z_loss_mlp": 0.18920898, + "step": 2557, + "time_per_iteration": 2.8081767559051514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080378, + "balance_loss_mlp": 1.06081533, + "epoch": 0.4921123509041939, + "flos": 492359980032.0, + "grad_norm": 0.09200682846254944, + "language_loss": 0.81391776, + "learning_rate": 0.0005370395738159773, + "loss": 0.82472152, + "num_input_tokens_seen": 213129984, + "router_z_loss_mlp": 0.19543457, + "step": 2558, + "time_per_iteration": 2.6609818935394287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084764, + "balance_loss_mlp": 1.06559491, + "epoch": 0.4923047325894575, + "flos": 546167162880.0, + "grad_norm": 0.08064506015832804, + "language_loss": 0.82711154, + "learning_rate": 0.0005367288803406003, + "loss": 0.83795917, + "num_input_tokens_seen": 213199184, + "router_z_loss_mlp": 0.19165039, + "step": 2559, + "time_per_iteration": 2.644026756286621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084251, + "balance_loss_mlp": 1.06544018, + "epoch": 0.49249711427472104, + "flos": 596473072128.0, + "grad_norm": 0.0889068964261426, + "language_loss": 0.81602907, + "learning_rate": 0.0005364181726058073, + "loss": 0.82687151, + "num_input_tokens_seen": 213272480, + "router_z_loss_mlp": 0.18798828, + "step": 2560, + "time_per_iteration": 2.7356274127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082609, + "balance_loss_mlp": 1.06403637, + "epoch": 0.4926894959599846, + "flos": 497825533440.0, + "grad_norm": 0.0950227496854857, + "language_loss": 0.82278556, + "learning_rate": 0.0005361074507322261, + "loss": 0.83361161, + "num_input_tokens_seen": 213338704, + "router_z_loss_mlp": 0.18566895, + "step": 2561, + "time_per_iteration": 2.663046360015869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086945, + 
"balance_loss_mlp": 1.06827641, + "epoch": 0.49288187764524816, + "flos": 536130648576.0, + "grad_norm": 0.07772582275378431, + "language_loss": 0.81617248, + "learning_rate": 0.000535796714840489, + "loss": 0.82704192, + "num_input_tokens_seen": 213406016, + "router_z_loss_mlp": 0.18664551, + "step": 2562, + "time_per_iteration": 2.638414144515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_mlp": 1.07574439, + "epoch": 0.49307425933051174, + "flos": 641555707392.0, + "grad_norm": 0.08606941059340069, + "language_loss": 0.83548921, + "learning_rate": 0.0005354859650512348, + "loss": 0.84643233, + "num_input_tokens_seen": 213474016, + "router_z_loss_mlp": 0.18566895, + "step": 2563, + "time_per_iteration": 2.786123752593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103932, + "balance_loss_mlp": 1.08636093, + "epoch": 0.4932666410157753, + "flos": 516252911616.0, + "grad_norm": 0.10665890037430359, + "language_loss": 0.87337875, + "learning_rate": 0.0005351752014851074, + "loss": 0.88441813, + "num_input_tokens_seen": 213539696, + "router_z_loss_mlp": 0.17578125, + "step": 2564, + "time_per_iteration": 2.5858397483825684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110675, + "balance_loss_mlp": 1.08847523, + "epoch": 0.49345902270103886, + "flos": 601503625728.0, + "grad_norm": 0.10057993561194663, + "language_loss": 0.83317149, + "learning_rate": 0.0005348644242627553, + "loss": 0.844239, + "num_input_tokens_seen": 213609504, + "router_z_loss_mlp": 0.1829834, + "step": 2565, + "time_per_iteration": 2.7638742923736572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050217, + "balance_loss_mlp": 1.04082322, + "epoch": 0.49365140438630245, + "flos": 1493673716736.0, + "grad_norm": 0.03479988729177956, + "language_loss": 0.75286627, + "learning_rate": 0.0005345536335048336, + "loss": 0.76336837, + "num_input_tokens_seen": 213846064, + "router_z_loss_mlp": 
0.09375, + "step": 2566, + "time_per_iteration": 4.947393417358398 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106718, + "balance_loss_mlp": 1.08951592, + "epoch": 0.493843786071566, + "flos": 629599329792.0, + "grad_norm": 0.06927642597141821, + "language_loss": 0.81322002, + "learning_rate": 0.0005342428293320013, + "loss": 0.82428724, + "num_input_tokens_seen": 213923216, + "router_z_loss_mlp": 0.17199707, + "step": 2567, + "time_per_iteration": 2.778985023498535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104881, + "balance_loss_mlp": 1.08785808, + "epoch": 0.49403616775682957, + "flos": 617564030976.0, + "grad_norm": 0.07155621127563581, + "language_loss": 0.83412832, + "learning_rate": 0.0005339320118649238, + "loss": 0.84517711, + "num_input_tokens_seen": 213994096, + "router_z_loss_mlp": 0.17041016, + "step": 2568, + "time_per_iteration": 2.7361106872558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118929, + "balance_loss_mlp": 1.10148847, + "epoch": 0.4942285494420931, + "flos": 577647770112.0, + "grad_norm": 0.06786367407396048, + "language_loss": 0.86708534, + "learning_rate": 0.000533621181224271, + "loss": 0.87827462, + "num_input_tokens_seen": 214069104, + "router_z_loss_mlp": 0.17443848, + "step": 2569, + "time_per_iteration": 2.8056747913360596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113987, + "balance_loss_mlp": 1.09679675, + "epoch": 0.4944209311273567, + "flos": 630211995648.0, + "grad_norm": 0.08062562134183447, + "language_loss": 0.81321245, + "learning_rate": 0.0005333103375307182, + "loss": 0.82435232, + "num_input_tokens_seen": 214150368, + "router_z_loss_mlp": 0.17211914, + "step": 2570, + "time_per_iteration": 2.904440402984619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114683, + "balance_loss_mlp": 1.09786248, + "epoch": 0.4946133128126202, + "flos": 587612703744.0, + "grad_norm": 0.06756621210058887, + 
"language_loss": 0.8584491, + "learning_rate": 0.0005329994809049451, + "loss": 0.86959589, + "num_input_tokens_seen": 214220112, + "router_z_loss_mlp": 0.16833496, + "step": 2571, + "time_per_iteration": 2.8053295612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131581, + "balance_loss_mlp": 1.11458206, + "epoch": 0.4948056944978838, + "flos": 583718648832.0, + "grad_norm": 0.09358938815201079, + "language_loss": 0.87904042, + "learning_rate": 0.0005326886114676375, + "loss": 0.89035624, + "num_input_tokens_seen": 214294480, + "router_z_loss_mlp": 0.17016602, + "step": 2572, + "time_per_iteration": 2.8100666999816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113844, + "balance_loss_mlp": 1.09724987, + "epoch": 0.49499807618314734, + "flos": 481822027776.0, + "grad_norm": 0.06954374103744322, + "language_loss": 0.87645632, + "learning_rate": 0.0005323777293394854, + "loss": 0.88759476, + "num_input_tokens_seen": 214359568, + "router_z_loss_mlp": 0.16601562, + "step": 2573, + "time_per_iteration": 2.6342670917510986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112174, + "balance_loss_mlp": 1.09544909, + "epoch": 0.4951904578684109, + "flos": 518978161152.0, + "grad_norm": 0.06551139751330846, + "language_loss": 0.82055044, + "learning_rate": 0.000532066834641184, + "loss": 0.83167219, + "num_input_tokens_seen": 214432032, + "router_z_loss_mlp": 0.1673584, + "step": 2574, + "time_per_iteration": 2.7459301948547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115153, + "balance_loss_mlp": 1.09861851, + "epoch": 0.4953828395536745, + "flos": 535505499648.0, + "grad_norm": 0.07271172156944823, + "language_loss": 0.85062492, + "learning_rate": 0.0005317559274934334, + "loss": 0.86177647, + "num_input_tokens_seen": 214504096, + "router_z_loss_mlp": 0.16540527, + "step": 2575, + "time_per_iteration": 2.79950213432312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01109887, + "balance_loss_mlp": 1.0929718, + "epoch": 0.49557522123893805, + "flos": 528564994560.0, + "grad_norm": 0.12491917898667039, + "language_loss": 0.80294836, + "learning_rate": 0.0005314450080169382, + "loss": 0.81404722, + "num_input_tokens_seen": 214575920, + "router_z_loss_mlp": 0.16931152, + "step": 2576, + "time_per_iteration": 2.646117687225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111519, + "balance_loss_mlp": 1.09459102, + "epoch": 0.49576760292420163, + "flos": 428007504384.0, + "grad_norm": 0.06948953090692808, + "language_loss": 0.80618382, + "learning_rate": 0.0005311340763324083, + "loss": 0.81729901, + "num_input_tokens_seen": 214641664, + "router_z_loss_mlp": 0.16931152, + "step": 2577, + "time_per_iteration": 2.637355327606201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115745, + "balance_loss_mlp": 1.09885335, + "epoch": 0.49595998460946517, + "flos": 565236942336.0, + "grad_norm": 0.06343391975743103, + "language_loss": 0.82572562, + "learning_rate": 0.0005308231325605578, + "loss": 0.83688301, + "num_input_tokens_seen": 214711744, + "router_z_loss_mlp": 0.16906738, + "step": 2578, + "time_per_iteration": 2.7532670497894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112411, + "balance_loss_mlp": 1.10721767, + "epoch": 0.49615236629472875, + "flos": 702490973184.0, + "grad_norm": 0.06763129936720796, + "language_loss": 0.76589197, + "learning_rate": 0.0005305121768221061, + "loss": 0.77713311, + "num_input_tokens_seen": 214802256, + "router_z_loss_mlp": 0.16906738, + "step": 2579, + "time_per_iteration": 3.099548816680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106106, + "balance_loss_mlp": 1.09718919, + "epoch": 0.4963447479799923, + "flos": 1441665630720.0, + "grad_norm": 0.03611799224355641, + "language_loss": 0.75038326, + "learning_rate": 0.000530201209237777, + "loss": 0.76144433, + "num_input_tokens_seen": 215023648, + 
"router_z_loss_mlp": 0.08935547, + "step": 2580, + "time_per_iteration": 4.822290658950806 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112299, + "balance_loss_mlp": 1.0955143, + "epoch": 0.49653712966525587, + "flos": 537627995136.0, + "grad_norm": 0.07683784808208224, + "language_loss": 0.91874099, + "learning_rate": 0.0005298902299282984, + "loss": 0.92986393, + "num_input_tokens_seen": 215094080, + "router_z_loss_mlp": 0.16796875, + "step": 2581, + "time_per_iteration": 2.6493284702301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117923, + "balance_loss_mlp": 1.10141301, + "epoch": 0.4967295113505194, + "flos": 607280467968.0, + "grad_norm": 0.09118838704679054, + "language_loss": 0.84425116, + "learning_rate": 0.0005295792390144033, + "loss": 0.85543042, + "num_input_tokens_seen": 215165456, + "router_z_loss_mlp": 0.16516113, + "step": 2582, + "time_per_iteration": 2.8000099658966064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121007, + "balance_loss_mlp": 1.1042583, + "epoch": 0.496921893035783, + "flos": 474577574400.0, + "grad_norm": 0.08989559260345804, + "language_loss": 0.83660305, + "learning_rate": 0.0005292682366168294, + "loss": 0.84781313, + "num_input_tokens_seen": 215229344, + "router_z_loss_mlp": 0.16760254, + "step": 2583, + "time_per_iteration": 2.573913812637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116101, + "balance_loss_mlp": 1.0993638, + "epoch": 0.4971142747210466, + "flos": 597463838208.0, + "grad_norm": 0.07863246165846992, + "language_loss": 0.79766655, + "learning_rate": 0.0005289572228563181, + "loss": 0.80882752, + "num_input_tokens_seen": 215305616, + "router_z_loss_mlp": 0.16748047, + "step": 2584, + "time_per_iteration": 2.807269811630249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114321, + "balance_loss_mlp": 1.09676123, + "epoch": 0.4973066564063101, + "flos": 599603586048.0, + "grad_norm": 
0.06809186764850061, + "language_loss": 0.8288846, + "learning_rate": 0.000528646197853616, + "loss": 0.84002781, + "num_input_tokens_seen": 215378128, + "router_z_loss_mlp": 0.17578125, + "step": 2585, + "time_per_iteration": 2.806168794631958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114035, + "balance_loss_mlp": 1.09709597, + "epoch": 0.4974990380915737, + "flos": 649474495488.0, + "grad_norm": 0.06908816819532054, + "language_loss": 0.85582453, + "learning_rate": 0.0005283351617294735, + "loss": 0.86696494, + "num_input_tokens_seen": 215453536, + "router_z_loss_mlp": 0.16943359, + "step": 2586, + "time_per_iteration": 2.926912784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_mlp": 1.02630937, + "epoch": 0.49769141977683723, + "flos": 1529278548480.0, + "grad_norm": 0.01596603428611825, + "language_loss": 0.7663666, + "learning_rate": 0.0005280241146046456, + "loss": 0.77671409, + "num_input_tokens_seen": 215689440, + "router_z_loss_mlp": 0.08447266, + "step": 2587, + "time_per_iteration": 5.0390965938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107376, + "balance_loss_mlp": 1.08937573, + "epoch": 0.4978838014621008, + "flos": 536370356736.0, + "grad_norm": 0.06339397332392985, + "language_loss": 0.86461538, + "learning_rate": 0.0005277130565998916, + "loss": 0.87568915, + "num_input_tokens_seen": 215759600, + "router_z_loss_mlp": 0.18005371, + "step": 2588, + "time_per_iteration": 2.770092248916626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116474, + "balance_loss_mlp": 1.09942722, + "epoch": 0.49807618314736435, + "flos": 539616867840.0, + "grad_norm": 0.058229952595652015, + "language_loss": 0.81859887, + "learning_rate": 0.0005274019878359748, + "loss": 0.82976359, + "num_input_tokens_seen": 215833920, + "router_z_loss_mlp": 0.17053223, + "step": 2589, + "time_per_iteration": 2.7338075637817383 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01114654, + "balance_loss_mlp": 1.09733331, + "epoch": 0.49826856483262794, + "flos": 542475740160.0, + "grad_norm": 0.09126406549336552, + "language_loss": 0.86714995, + "learning_rate": 0.0005270909084336628, + "loss": 0.87829649, + "num_input_tokens_seen": 215903616, + "router_z_loss_mlp": 0.17333984, + "step": 2590, + "time_per_iteration": 2.65108323097229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116441, + "balance_loss_mlp": 1.09858298, + "epoch": 0.4984609465178915, + "flos": 522321219072.0, + "grad_norm": 0.1060624554819127, + "language_loss": 0.88702905, + "learning_rate": 0.0005267798185137276, + "loss": 0.89819348, + "num_input_tokens_seen": 215974832, + "router_z_loss_mlp": 0.17871094, + "step": 2591, + "time_per_iteration": 2.6553287506103516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105559, + "balance_loss_mlp": 1.08758211, + "epoch": 0.49865332820315506, + "flos": 574544420352.0, + "grad_norm": 0.13093350294478928, + "language_loss": 0.88770413, + "learning_rate": 0.0005264687181969444, + "loss": 0.89875972, + "num_input_tokens_seen": 216045024, + "router_z_loss_mlp": 0.17980957, + "step": 2592, + "time_per_iteration": 2.7969043254852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110929, + "balance_loss_mlp": 1.0928092, + "epoch": 0.49884570988841864, + "flos": 1013607115776.0, + "grad_norm": 0.07529154121690083, + "language_loss": 0.74930251, + "learning_rate": 0.0005261576076040937, + "loss": 0.76041174, + "num_input_tokens_seen": 216129024, + "router_z_loss_mlp": 0.18127441, + "step": 2593, + "time_per_iteration": 3.3571712970733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101158, + "balance_loss_mlp": 1.08368254, + "epoch": 0.4990380915736822, + "flos": 559581239808.0, + "grad_norm": 0.07032432999454871, + "language_loss": 0.83977568, + "learning_rate": 0.0005258464868559591, + "loss": 0.85078728, + 
"num_input_tokens_seen": 216197648, + "router_z_loss_mlp": 0.17492676, + "step": 2594, + "time_per_iteration": 2.691549301147461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102198, + "balance_loss_mlp": 1.08469868, + "epoch": 0.49923047325894576, + "flos": 498954691584.0, + "grad_norm": 0.06016242034808734, + "language_loss": 0.88749588, + "learning_rate": 0.0005255353560733284, + "loss": 0.89851785, + "num_input_tokens_seen": 216263904, + "router_z_loss_mlp": 0.17529297, + "step": 2595, + "time_per_iteration": 2.643775701522827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074264, + "balance_loss_mlp": 1.0654906, + "epoch": 0.4994228549442093, + "flos": 1496636476416.0, + "grad_norm": 0.03161132267250996, + "language_loss": 0.75578642, + "learning_rate": 0.0005252242153769931, + "loss": 0.76652908, + "num_input_tokens_seen": 216493152, + "router_z_loss_mlp": 0.08789062, + "step": 2596, + "time_per_iteration": 4.8261682987213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011005, + "balance_loss_mlp": 1.08255887, + "epoch": 0.4996152366294729, + "flos": 557374680576.0, + "grad_norm": 0.06872371897226848, + "language_loss": 0.83470559, + "learning_rate": 0.0005249130648877492, + "loss": 0.84571064, + "num_input_tokens_seen": 216567216, + "router_z_loss_mlp": 0.17956543, + "step": 2597, + "time_per_iteration": 2.793973445892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099762, + "balance_loss_mlp": 1.08096313, + "epoch": 0.4998076183147364, + "flos": 415594105344.0, + "grad_norm": 0.07739235171207769, + "language_loss": 0.84593171, + "learning_rate": 0.0005246019047263953, + "loss": 0.8569293, + "num_input_tokens_seen": 216630624, + "router_z_loss_mlp": 0.18798828, + "step": 2598, + "time_per_iteration": 2.5284597873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103028, + "balance_loss_mlp": 1.08447933, + "epoch": 0.5, + "flos": 467350373376.0, + 
"grad_norm": 0.0766017052589062, + "language_loss": 0.82300264, + "learning_rate": 0.0005242907350137353, + "loss": 0.83403295, + "num_input_tokens_seen": 216696576, + "router_z_loss_mlp": 0.18554688, + "step": 2599, + "time_per_iteration": 2.57824969291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102331, + "balance_loss_mlp": 1.08466387, + "epoch": 0.5001923816852636, + "flos": 482718818304.0, + "grad_norm": 0.07109220242790512, + "language_loss": 0.78955519, + "learning_rate": 0.0005239795558705754, + "loss": 0.80057847, + "num_input_tokens_seen": 216767584, + "router_z_loss_mlp": 0.17675781, + "step": 2600, + "time_per_iteration": 2.735712766647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093225, + "balance_loss_mlp": 1.07491398, + "epoch": 0.5003847633705272, + "flos": 533798180352.0, + "grad_norm": 0.0850656909263446, + "language_loss": 0.89518678, + "learning_rate": 0.0005236683674177264, + "loss": 0.90611899, + "num_input_tokens_seen": 216834320, + "router_z_loss_mlp": 0.18310547, + "step": 2601, + "time_per_iteration": 2.7013046741485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101472, + "balance_loss_mlp": 1.08336401, + "epoch": 0.5005771450557907, + "flos": 737789285376.0, + "grad_norm": 0.06829559635091415, + "language_loss": 0.82179487, + "learning_rate": 0.0005233571697760021, + "loss": 0.83280951, + "num_input_tokens_seen": 216907312, + "router_z_loss_mlp": 0.18103027, + "step": 2602, + "time_per_iteration": 2.902503490447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101813, + "balance_loss_mlp": 1.08420539, + "epoch": 0.5007695267410542, + "flos": 778977865728.0, + "grad_norm": 0.10152220944898022, + "language_loss": 0.82961535, + "learning_rate": 0.0005230459630662203, + "loss": 0.84063351, + "num_input_tokens_seen": 216979872, + "router_z_loss_mlp": 0.17626953, + "step": 2603, + "time_per_iteration": 2.966848134994507 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108628, + "balance_loss_mlp": 1.09103274, + "epoch": 0.5009619084263178, + "flos": 623476694016.0, + "grad_norm": 0.07939636618021073, + "language_loss": 0.8145076, + "learning_rate": 0.0005227347474092022, + "loss": 0.82559389, + "num_input_tokens_seen": 217054000, + "router_z_loss_mlp": 0.17602539, + "step": 2604, + "time_per_iteration": 2.76577091217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107422, + "balance_loss_mlp": 1.08948135, + "epoch": 0.5011542901115814, + "flos": 531087611904.0, + "grad_norm": 0.06357584490296206, + "language_loss": 0.82990885, + "learning_rate": 0.0005224235229257724, + "loss": 0.84098309, + "num_input_tokens_seen": 217126784, + "router_z_loss_mlp": 0.17956543, + "step": 2605, + "time_per_iteration": 2.798074245452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108194, + "balance_loss_mlp": 1.09092093, + "epoch": 0.5013466717968449, + "flos": 527534581248.0, + "grad_norm": 0.059877769950401664, + "language_loss": 0.86506116, + "learning_rate": 0.0005221122897367589, + "loss": 0.8761431, + "num_input_tokens_seen": 217203056, + "router_z_loss_mlp": 0.17285156, + "step": 2606, + "time_per_iteration": 2.8442416191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120744, + "balance_loss_mlp": 1.10386384, + "epoch": 0.5015390534821085, + "flos": 566017735680.0, + "grad_norm": 0.08858636737693353, + "language_loss": 0.81257951, + "learning_rate": 0.0005218010479629932, + "loss": 0.82378697, + "num_input_tokens_seen": 217273280, + "router_z_loss_mlp": 0.16882324, + "step": 2607, + "time_per_iteration": 2.720196485519409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112059, + "balance_loss_mlp": 1.09503603, + "epoch": 0.5017314351673721, + "flos": 566697212928.0, + "grad_norm": 0.09219088613115281, + "language_loss": 0.82021785, + "learning_rate": 0.0005214897977253102, + "loss": 0.83133841, + 
"num_input_tokens_seen": 217345568, + "router_z_loss_mlp": 0.17041016, + "step": 2608, + "time_per_iteration": 2.6824939250946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104466, + "balance_loss_mlp": 1.08703792, + "epoch": 0.5019238168526357, + "flos": 522291483648.0, + "grad_norm": 0.05892482680876805, + "language_loss": 0.84221715, + "learning_rate": 0.0005211785391445473, + "loss": 0.85326183, + "num_input_tokens_seen": 217422848, + "router_z_loss_mlp": 0.17456055, + "step": 2609, + "time_per_iteration": 2.72525954246521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.08809578, + "epoch": 0.5021161985378992, + "flos": 641434567680.0, + "grad_norm": 0.07489132465153774, + "language_loss": 0.79042387, + "learning_rate": 0.0005208672723415467, + "loss": 0.80148035, + "num_input_tokens_seen": 217502896, + "router_z_loss_mlp": 0.17553711, + "step": 2610, + "time_per_iteration": 2.8028247356414795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110106, + "balance_loss_mlp": 1.08385801, + "epoch": 0.5023085802231627, + "flos": 591284302848.0, + "grad_norm": 0.08294073768606391, + "language_loss": 0.7915107, + "learning_rate": 0.0005205559974371525, + "loss": 0.80252123, + "num_input_tokens_seen": 217575072, + "router_z_loss_mlp": 0.17211914, + "step": 2611, + "time_per_iteration": 2.7850143909454346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094923, + "balance_loss_mlp": 1.07810235, + "epoch": 0.5025009619084263, + "flos": 472373586432.0, + "grad_norm": 0.07295315460395477, + "language_loss": 0.82193494, + "learning_rate": 0.0005202447145522123, + "loss": 0.83288413, + "num_input_tokens_seen": 217644976, + "router_z_loss_mlp": 0.16821289, + "step": 2612, + "time_per_iteration": 2.700307607650757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090548, + "balance_loss_mlp": 1.07344127, + "epoch": 0.5026933435936899, + "flos": 
455139606528.0, + "grad_norm": 0.0792727031944949, + "language_loss": 0.79256612, + "learning_rate": 0.0005199334238075769, + "loss": 0.80347157, + "num_input_tokens_seen": 217712816, + "router_z_loss_mlp": 0.17126465, + "step": 2613, + "time_per_iteration": 2.6153087615966797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089787, + "balance_loss_mlp": 1.07271576, + "epoch": 0.5028857252789535, + "flos": 491747314176.0, + "grad_norm": 0.08033639738386796, + "language_loss": 0.91661727, + "learning_rate": 0.0005196221253241, + "loss": 0.92751515, + "num_input_tokens_seen": 217780256, + "router_z_loss_mlp": 0.17089844, + "step": 2614, + "time_per_iteration": 2.6069750785827637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088826, + "balance_loss_mlp": 1.07155263, + "epoch": 0.503078106964217, + "flos": 625569454080.0, + "grad_norm": 0.07969948054344475, + "language_loss": 0.82871294, + "learning_rate": 0.0005193108192226383, + "loss": 0.83960116, + "num_input_tokens_seen": 217848496, + "router_z_loss_mlp": 0.17272949, + "step": 2615, + "time_per_iteration": 2.8156328201293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084024, + "balance_loss_mlp": 1.06673825, + "epoch": 0.5032704886494805, + "flos": 579046371840.0, + "grad_norm": 0.06296322155163143, + "language_loss": 0.86797768, + "learning_rate": 0.000518999505624052, + "loss": 0.87881792, + "num_input_tokens_seen": 217919216, + "router_z_loss_mlp": 0.1730957, + "step": 2616, + "time_per_iteration": 2.7152223587036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080227, + "balance_loss_mlp": 1.06292999, + "epoch": 0.5034628703347441, + "flos": 471753206784.0, + "grad_norm": 0.05958638296552923, + "language_loss": 0.83317488, + "learning_rate": 0.000518688184649203, + "loss": 0.84397715, + "num_input_tokens_seen": 217996096, + "router_z_loss_mlp": 0.17297363, + "step": 2617, + "time_per_iteration": 2.8284754753112793 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108043, + "balance_loss_mlp": 1.06272697, + "epoch": 0.5036552520200077, + "flos": 489837362688.0, + "grad_norm": 0.07368279711977406, + "language_loss": 0.83787394, + "learning_rate": 0.0005183768564189577, + "loss": 0.84867823, + "num_input_tokens_seen": 218063072, + "router_z_loss_mlp": 0.17724609, + "step": 2618, + "time_per_iteration": 2.591064453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083174, + "balance_loss_mlp": 1.06613898, + "epoch": 0.5038476337052713, + "flos": 494235426816.0, + "grad_norm": 0.08850035073541652, + "language_loss": 0.81363833, + "learning_rate": 0.0005180655210541838, + "loss": 0.82447004, + "num_input_tokens_seen": 218131056, + "router_z_loss_mlp": 0.17041016, + "step": 2619, + "time_per_iteration": 2.5832765102386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086664, + "balance_loss_mlp": 1.06910443, + "epoch": 0.5040400153905348, + "flos": 600604263936.0, + "grad_norm": 0.09602250816000424, + "language_loss": 0.8361724, + "learning_rate": 0.0005177541786757527, + "loss": 0.8470391, + "num_input_tokens_seen": 218203536, + "router_z_loss_mlp": 0.17565918, + "step": 2620, + "time_per_iteration": 2.8272600173950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081312, + "balance_loss_mlp": 1.0633707, + "epoch": 0.5042323970757984, + "flos": 811525962240.0, + "grad_norm": 0.08634316495635827, + "language_loss": 0.82817882, + "learning_rate": 0.000517442829404538, + "loss": 0.838992, + "num_input_tokens_seen": 218283008, + "router_z_loss_mlp": 0.17956543, + "step": 2621, + "time_per_iteration": 3.0231099128723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108588, + "balance_loss_mlp": 1.06736684, + "epoch": 0.504424778761062, + "flos": 627308706816.0, + "grad_norm": 0.07086048560872778, + "language_loss": 0.87109387, + "learning_rate": 0.0005171314733614166, + "loss": 0.88195264, + 
"num_input_tokens_seen": 218362096, + "router_z_loss_mlp": 0.18505859, + "step": 2622, + "time_per_iteration": 2.924490213394165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092184, + "balance_loss_mlp": 1.07450485, + "epoch": 0.5046171604463255, + "flos": 515911887360.0, + "grad_norm": 0.09670552238526126, + "language_loss": 0.78441215, + "learning_rate": 0.0005168201106672671, + "loss": 0.79533398, + "num_input_tokens_seen": 218439440, + "router_z_loss_mlp": 0.17700195, + "step": 2623, + "time_per_iteration": 2.7627530097961426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081433, + "balance_loss_mlp": 1.06351626, + "epoch": 0.504809542131589, + "flos": 527831188992.0, + "grad_norm": 0.07080566946451637, + "language_loss": 0.8469494, + "learning_rate": 0.0005165087414429717, + "loss": 0.85776377, + "num_input_tokens_seen": 218505936, + "router_z_loss_mlp": 0.17932129, + "step": 2624, + "time_per_iteration": 2.6216189861297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078588, + "balance_loss_mlp": 1.06013489, + "epoch": 0.5050019238168526, + "flos": 554118257664.0, + "grad_norm": 0.07518378231968396, + "language_loss": 0.83469629, + "learning_rate": 0.0005161973658094144, + "loss": 0.84548217, + "num_input_tokens_seen": 218573824, + "router_z_loss_mlp": 0.18444824, + "step": 2625, + "time_per_iteration": 2.686030864715576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077122, + "balance_loss_mlp": 1.05919266, + "epoch": 0.5051943055021162, + "flos": 574774216704.0, + "grad_norm": 0.07052814404413787, + "language_loss": 0.82367003, + "learning_rate": 0.000515885983887482, + "loss": 0.83444118, + "num_input_tokens_seen": 218648016, + "router_z_loss_mlp": 0.17944336, + "step": 2626, + "time_per_iteration": 2.742265224456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073389, + "balance_loss_mlp": 1.05478024, + "epoch": 0.5053866871873798, + "flos": 
496686463488.0, + "grad_norm": 0.0761715011076948, + "language_loss": 0.84318763, + "learning_rate": 0.0005155745957980636, + "loss": 0.85392147, + "num_input_tokens_seen": 218714128, + "router_z_loss_mlp": 0.18615723, + "step": 2627, + "time_per_iteration": 2.6049954891204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074823, + "balance_loss_mlp": 1.05586839, + "epoch": 0.5055790688726434, + "flos": 502213685760.0, + "grad_norm": 0.07614118511738227, + "language_loss": 0.88045084, + "learning_rate": 0.000515263201662051, + "loss": 0.89119911, + "num_input_tokens_seen": 218784800, + "router_z_loss_mlp": 0.18945312, + "step": 2628, + "time_per_iteration": 2.7101621627807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084597, + "balance_loss_mlp": 1.06617892, + "epoch": 0.5057714505579068, + "flos": 845227809792.0, + "grad_norm": 0.07415998964954142, + "language_loss": 0.82280606, + "learning_rate": 0.0005149518016003378, + "loss": 0.83365202, + "num_input_tokens_seen": 218868256, + "router_z_loss_mlp": 0.1842041, + "step": 2629, + "time_per_iteration": 3.194669723510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080493, + "balance_loss_mlp": 1.06227767, + "epoch": 0.5059638322431704, + "flos": 497825533440.0, + "grad_norm": 0.07616905133259881, + "language_loss": 0.8214519, + "learning_rate": 0.0005146403957338206, + "loss": 0.83225679, + "num_input_tokens_seen": 218932496, + "router_z_loss_mlp": 0.18212891, + "step": 2630, + "time_per_iteration": 2.6495327949523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092259, + "balance_loss_mlp": 1.07468796, + "epoch": 0.506156213928434, + "flos": 617843013120.0, + "grad_norm": 0.06296513552488332, + "language_loss": 0.81962919, + "learning_rate": 0.0005143289841833975, + "loss": 0.8305518, + "num_input_tokens_seen": 219010672, + "router_z_loss_mlp": 0.17578125, + "step": 2631, + "time_per_iteration": 2.8716421127319336 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092491, + "balance_loss_mlp": 1.07512259, + "epoch": 0.5063485956136976, + "flos": 424857166848.0, + "grad_norm": 0.0779936416436138, + "language_loss": 0.82076275, + "learning_rate": 0.0005140175670699696, + "loss": 0.83168757, + "num_input_tokens_seen": 219077104, + "router_z_loss_mlp": 0.17382812, + "step": 2632, + "time_per_iteration": 2.6159043312072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108859, + "balance_loss_mlp": 1.07069623, + "epoch": 0.5065409772989612, + "flos": 569926471680.0, + "grad_norm": 0.053505876641590386, + "language_loss": 0.82692468, + "learning_rate": 0.0005137061445144395, + "loss": 0.83781052, + "num_input_tokens_seen": 219164880, + "router_z_loss_mlp": 0.17895508, + "step": 2633, + "time_per_iteration": 2.9435369968414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102566, + "balance_loss_mlp": 1.08499455, + "epoch": 0.5067333589842247, + "flos": 628801284096.0, + "grad_norm": 0.07429237358898076, + "language_loss": 0.86728698, + "learning_rate": 0.000513394716637712, + "loss": 0.87831259, + "num_input_tokens_seen": 219237376, + "router_z_loss_mlp": 0.17590332, + "step": 2634, + "time_per_iteration": 2.785621404647827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031528, + "balance_loss_mlp": 1.02165747, + "epoch": 0.5069257406694883, + "flos": 1447867187712.0, + "grad_norm": 0.025420781551357425, + "language_loss": 0.79191709, + "learning_rate": 0.0005130832835606946, + "loss": 0.80223238, + "num_input_tokens_seen": 219467632, + "router_z_loss_mlp": 0.09863281, + "step": 2635, + "time_per_iteration": 4.87060809135437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103723, + "balance_loss_mlp": 1.08666396, + "epoch": 0.5071181223547518, + "flos": 638835227136.0, + "grad_norm": 0.0808554701524121, + "language_loss": 0.8102541, + "learning_rate": 0.0005127718454042958, + "loss": 0.82129133, + 
"num_input_tokens_seen": 219545392, + "router_z_loss_mlp": 0.1706543, + "step": 2636, + "time_per_iteration": 2.8784031867980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102685, + "balance_loss_mlp": 1.08523273, + "epoch": 0.5073105040400154, + "flos": 713565241344.0, + "grad_norm": 0.07186288747403746, + "language_loss": 0.84171808, + "learning_rate": 0.0005124604022894269, + "loss": 0.85274494, + "num_input_tokens_seen": 219623104, + "router_z_loss_mlp": 0.17468262, + "step": 2637, + "time_per_iteration": 2.9495620727539062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018568, + "balance_loss_mlp": 1.00903082, + "epoch": 0.5075028857252789, + "flos": 1436447126016.0, + "grad_norm": 0.013467544944548519, + "language_loss": 0.77188224, + "learning_rate": 0.000512148954337001, + "loss": 0.78206789, + "num_input_tokens_seen": 219853328, + "router_z_loss_mlp": 0.09521484, + "step": 2638, + "time_per_iteration": 4.841961145401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100977, + "balance_loss_mlp": 1.08402538, + "epoch": 0.5076952674105425, + "flos": 571147034112.0, + "grad_norm": 0.0754060533252176, + "language_loss": 0.83016658, + "learning_rate": 0.0005118375016679325, + "loss": 0.84117633, + "num_input_tokens_seen": 219925024, + "router_z_loss_mlp": 0.16967773, + "step": 2639, + "time_per_iteration": 2.7659313678741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094497, + "balance_loss_mlp": 1.07784295, + "epoch": 0.5078876490958061, + "flos": 516712504320.0, + "grad_norm": 0.08036414838520123, + "language_loss": 0.80592823, + "learning_rate": 0.0005115260444031382, + "loss": 0.81687325, + "num_input_tokens_seen": 219992752, + "router_z_loss_mlp": 0.16662598, + "step": 2640, + "time_per_iteration": 2.6009633541107178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012329, + "balance_loss_mlp": 1.00350785, + "epoch": 0.5080800307810697, + "flos": 
1584224428032.0, + "grad_norm": 0.011999730841431432, + "language_loss": 0.78731823, + "learning_rate": 0.000511214582663537, + "loss": 0.79744148, + "num_input_tokens_seen": 220224160, + "router_z_loss_mlp": 0.08837891, + "step": 2641, + "time_per_iteration": 4.949390411376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097313, + "balance_loss_mlp": 1.08012342, + "epoch": 0.5082724124663333, + "flos": 485209502208.0, + "grad_norm": 0.07347538330964974, + "language_loss": 0.87067777, + "learning_rate": 0.0005109031165700483, + "loss": 0.88165087, + "num_input_tokens_seen": 220289504, + "router_z_loss_mlp": 0.17211914, + "step": 2642, + "time_per_iteration": 2.571359634399414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089013, + "balance_loss_mlp": 1.07212138, + "epoch": 0.5084647941515967, + "flos": 682230366720.0, + "grad_norm": 0.07982577059913512, + "language_loss": 0.8353101, + "learning_rate": 0.0005105916462435945, + "loss": 0.84620023, + "num_input_tokens_seen": 220361376, + "router_z_loss_mlp": 0.16894531, + "step": 2643, + "time_per_iteration": 2.853332996368408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090538, + "balance_loss_mlp": 1.07358634, + "epoch": 0.5086571758368603, + "flos": 548736768000.0, + "grad_norm": 0.06767023016464803, + "language_loss": 0.85332114, + "learning_rate": 0.0005102801718050989, + "loss": 0.86422646, + "num_input_tokens_seen": 220434720, + "router_z_loss_mlp": 0.16967773, + "step": 2644, + "time_per_iteration": 2.71907377243042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085318, + "balance_loss_mlp": 1.06869972, + "epoch": 0.5088495575221239, + "flos": 564016379904.0, + "grad_norm": 0.08980112743883228, + "language_loss": 0.89031243, + "learning_rate": 0.0005099686933754867, + "loss": 0.9011656, + "num_input_tokens_seen": 220506208, + "router_z_loss_mlp": 0.16625977, + "step": 2645, + "time_per_iteration": 2.759768009185791 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108757, + "balance_loss_mlp": 1.07075, + "epoch": 0.5090419392073875, + "flos": 551407689216.0, + "grad_norm": 0.07519563415405216, + "language_loss": 0.84095073, + "learning_rate": 0.0005096572110756845, + "loss": 0.85182643, + "num_input_tokens_seen": 220577456, + "router_z_loss_mlp": 0.16833496, + "step": 2646, + "time_per_iteration": 2.742478132247925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083884, + "balance_loss_mlp": 1.06656277, + "epoch": 0.509234320892651, + "flos": 567779383296.0, + "grad_norm": 0.06876057003625125, + "language_loss": 0.85465425, + "learning_rate": 0.0005093457250266205, + "loss": 0.86549312, + "num_input_tokens_seen": 220649648, + "router_z_loss_mlp": 0.17333984, + "step": 2647, + "time_per_iteration": 2.762909173965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091086, + "balance_loss_mlp": 1.073717, + "epoch": 0.5094267025779146, + "flos": 582609314304.0, + "grad_norm": 0.1044353617825215, + "language_loss": 0.8341682, + "learning_rate": 0.000509034235349224, + "loss": 0.84507906, + "num_input_tokens_seen": 220721168, + "router_z_loss_mlp": 0.1739502, + "step": 2648, + "time_per_iteration": 2.726165533065796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109791, + "balance_loss_mlp": 1.08109021, + "epoch": 0.5096190842631781, + "flos": 591990944256.0, + "grad_norm": 0.07313436933557896, + "language_loss": 0.81423604, + "learning_rate": 0.0005087227421644266, + "loss": 0.8252151, + "num_input_tokens_seen": 220796464, + "router_z_loss_mlp": 0.16821289, + "step": 2649, + "time_per_iteration": 2.753390312194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108015, + "balance_loss_mlp": 1.09102726, + "epoch": 0.5098114659484417, + "flos": 513562166784.0, + "grad_norm": 0.0718220857310726, + "language_loss": 0.85905892, + "learning_rate": 0.0005084112455931602, + "loss": 0.87013906, + 
"num_input_tokens_seen": 220862976, + "router_z_loss_mlp": 0.17004395, + "step": 2650, + "time_per_iteration": 2.5981361865997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116436, + "balance_loss_mlp": 1.0991627, + "epoch": 0.5100038476337053, + "flos": 484631341056.0, + "grad_norm": 0.0710139819724768, + "language_loss": 0.84867871, + "learning_rate": 0.0005080997457563586, + "loss": 0.85984302, + "num_input_tokens_seen": 220926432, + "router_z_loss_mlp": 0.17297363, + "step": 2651, + "time_per_iteration": 2.5604488849639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125127, + "balance_loss_mlp": 1.10802007, + "epoch": 0.5101962293189688, + "flos": 461603266560.0, + "grad_norm": 0.08475984872157578, + "language_loss": 0.78772122, + "learning_rate": 0.0005077882427749569, + "loss": 0.79897249, + "num_input_tokens_seen": 220993008, + "router_z_loss_mlp": 0.17114258, + "step": 2652, + "time_per_iteration": 2.5588836669921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137152, + "balance_loss_mlp": 1.12011659, + "epoch": 0.5103886110042324, + "flos": 587034542592.0, + "grad_norm": 0.0878101507805391, + "language_loss": 0.84672785, + "learning_rate": 0.0005074767367698913, + "loss": 0.85809934, + "num_input_tokens_seen": 221059248, + "router_z_loss_mlp": 0.17041016, + "step": 2653, + "time_per_iteration": 2.7424826622009277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113443, + "balance_loss_mlp": 1.11758542, + "epoch": 0.510580992689496, + "flos": 845260116480.0, + "grad_norm": 0.10879937034210539, + "language_loss": 0.83426005, + "learning_rate": 0.0005071652278620988, + "loss": 0.8456043, + "num_input_tokens_seen": 221133712, + "router_z_loss_mlp": 0.16845703, + "step": 2654, + "time_per_iteration": 3.09969162940979 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124784, + "balance_loss_mlp": 1.10785651, + "epoch": 0.5107733743747596, + "flos": 
658624131072.0, + "grad_norm": 0.10475987580925356, + "language_loss": 0.83118153, + "learning_rate": 0.0005068537161725186, + "loss": 0.8424294, + "num_input_tokens_seen": 221202192, + "router_z_loss_mlp": 0.16943359, + "step": 2655, + "time_per_iteration": 2.82289719581604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116916, + "balance_loss_mlp": 1.09999979, + "epoch": 0.510965756060023, + "flos": 701732574720.0, + "grad_norm": 0.07925993280329827, + "language_loss": 0.84691739, + "learning_rate": 0.0005065422018220893, + "loss": 0.85808647, + "num_input_tokens_seen": 221277104, + "router_z_loss_mlp": 0.16931152, + "step": 2656, + "time_per_iteration": 2.8794078826904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112009, + "balance_loss_mlp": 1.09535527, + "epoch": 0.5111581377452866, + "flos": 559731741696.0, + "grad_norm": 0.07178639525503218, + "language_loss": 0.80310833, + "learning_rate": 0.0005062306849317521, + "loss": 0.81422836, + "num_input_tokens_seen": 221352320, + "router_z_loss_mlp": 0.16662598, + "step": 2657, + "time_per_iteration": 2.814025402069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110163, + "balance_loss_mlp": 1.09374762, + "epoch": 0.5113505194305502, + "flos": 609024863232.0, + "grad_norm": 0.09425319021973573, + "language_loss": 0.83069956, + "learning_rate": 0.0005059191656224487, + "loss": 0.84180123, + "num_input_tokens_seen": 221421056, + "router_z_loss_mlp": 0.16418457, + "step": 2658, + "time_per_iteration": 2.7602522373199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110977, + "balance_loss_mlp": 1.09316397, + "epoch": 0.5115429011158138, + "flos": 534477657600.0, + "grad_norm": 0.10010645818095278, + "language_loss": 0.89003229, + "learning_rate": 0.0005056076440151212, + "loss": 0.90113008, + "num_input_tokens_seen": 221492064, + "router_z_loss_mlp": 0.1661377, + "step": 2659, + "time_per_iteration": 2.7027831077575684 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071712, + "balance_loss_mlp": 1.06413066, + "epoch": 0.5117352828010774, + "flos": 1362213780480.0, + "grad_norm": 0.039772151853185514, + "language_loss": 0.76288116, + "learning_rate": 0.0005052961202307133, + "loss": 0.77359831, + "num_input_tokens_seen": 221724672, + "router_z_loss_mlp": 0.07568359, + "step": 2660, + "time_per_iteration": 4.856590032577515 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115167, + "balance_loss_mlp": 1.09887075, + "epoch": 0.5119276644863409, + "flos": 633740433408.0, + "grad_norm": 0.06725256479668422, + "language_loss": 0.86826003, + "learning_rate": 0.0005049845943901691, + "loss": 0.87941164, + "num_input_tokens_seen": 221800144, + "router_z_loss_mlp": 0.16296387, + "step": 2661, + "time_per_iteration": 2.8570423126220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122122, + "balance_loss_mlp": 1.10631514, + "epoch": 0.5121200461716044, + "flos": 585598864896.0, + "grad_norm": 0.0894536064907193, + "language_loss": 0.8667441, + "learning_rate": 0.0005046730666144338, + "loss": 0.87796533, + "num_input_tokens_seen": 221877168, + "router_z_loss_mlp": 0.15795898, + "step": 2662, + "time_per_iteration": 2.883822202682495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119148, + "balance_loss_mlp": 1.10315049, + "epoch": 0.512312427856868, + "flos": 1032508767744.0, + "grad_norm": 0.06658438993973123, + "language_loss": 0.87964702, + "learning_rate": 0.0005043615370244532, + "loss": 0.8908385, + "num_input_tokens_seen": 221964208, + "router_z_loss_mlp": 0.15991211, + "step": 2663, + "time_per_iteration": 3.388521671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_mlp": 1.02103686, + "epoch": 0.5125048095421316, + "flos": 1537983645696.0, + "grad_norm": 0.01281563800895277, + "language_loss": 0.78244388, + "learning_rate": 0.0005040500057411736, + "loss": 0.79272962, 
+ "num_input_tokens_seen": 222179264, + "router_z_loss_mlp": 0.07519531, + "step": 2664, + "time_per_iteration": 4.6337666511535645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119325, + "balance_loss_mlp": 1.10361338, + "epoch": 0.5126971912273951, + "flos": 591116175360.0, + "grad_norm": 0.058968241204554794, + "language_loss": 0.85154796, + "learning_rate": 0.0005037384728855425, + "loss": 0.86274123, + "num_input_tokens_seen": 222259504, + "router_z_loss_mlp": 0.15698242, + "step": 2665, + "time_per_iteration": 2.8316938877105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116454, + "balance_loss_mlp": 1.10032547, + "epoch": 0.5128895729126587, + "flos": 551657309184.0, + "grad_norm": 0.07313815870373463, + "language_loss": 0.8427707, + "learning_rate": 0.0005034269385785075, + "loss": 0.85393524, + "num_input_tokens_seen": 222330512, + "router_z_loss_mlp": 0.16125488, + "step": 2666, + "time_per_iteration": 2.705953359603882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119966, + "balance_loss_mlp": 1.10405147, + "epoch": 0.5130819545979223, + "flos": 481271030784.0, + "grad_norm": 0.09131160106886373, + "language_loss": 0.84140623, + "learning_rate": 0.0005031154029410168, + "loss": 0.85260594, + "num_input_tokens_seen": 222394000, + "router_z_loss_mlp": 0.15905762, + "step": 2667, + "time_per_iteration": 2.5483505725860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121112, + "balance_loss_mlp": 1.10497081, + "epoch": 0.5132743362831859, + "flos": 475798136832.0, + "grad_norm": 0.07350853386407429, + "language_loss": 0.86393219, + "learning_rate": 0.0005028038660940197, + "loss": 0.87514335, + "num_input_tokens_seen": 222459344, + "router_z_loss_mlp": 0.16137695, + "step": 2668, + "time_per_iteration": 2.5729174613952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117082, + "balance_loss_mlp": 1.10103667, + "epoch": 0.5134667179684494, + "flos": 
503827029504.0, + "grad_norm": 0.06973928207648594, + "language_loss": 0.84257567, + "learning_rate": 0.0005024923281584648, + "loss": 0.85374653, + "num_input_tokens_seen": 222528912, + "router_z_loss_mlp": 0.16040039, + "step": 2669, + "time_per_iteration": 2.695422410964966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112964, + "balance_loss_mlp": 1.11378479, + "epoch": 0.5136590996537129, + "flos": 503918433792.0, + "grad_norm": 0.07121106891997668, + "language_loss": 0.82480651, + "learning_rate": 0.0005021807892553026, + "loss": 0.8361029, + "num_input_tokens_seen": 222604704, + "router_z_loss_mlp": 0.15844727, + "step": 2670, + "time_per_iteration": 2.751401662826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129611, + "balance_loss_mlp": 1.11330318, + "epoch": 0.5138514813389765, + "flos": 624623104512.0, + "grad_norm": 0.07354407823714339, + "language_loss": 0.84572917, + "learning_rate": 0.0005018692495054828, + "loss": 0.85702527, + "num_input_tokens_seen": 222677888, + "router_z_loss_mlp": 0.16308594, + "step": 2671, + "time_per_iteration": 2.757593870162964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123606, + "balance_loss_mlp": 1.10785806, + "epoch": 0.5140438630242401, + "flos": 583545752064.0, + "grad_norm": 0.06661441717787603, + "language_loss": 0.80650961, + "learning_rate": 0.0005015577090299561, + "loss": 0.81774569, + "num_input_tokens_seen": 222751936, + "router_z_loss_mlp": 0.15734863, + "step": 2672, + "time_per_iteration": 2.693725347518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110731, + "balance_loss_mlp": 1.09435153, + "epoch": 0.5142362447095037, + "flos": 487927411200.0, + "grad_norm": 0.07298787487316409, + "language_loss": 0.86515582, + "learning_rate": 0.0005012461679496729, + "loss": 0.87626314, + "num_input_tokens_seen": 222819616, + "router_z_loss_mlp": 0.16381836, + "step": 2673, + "time_per_iteration": 2.6318869590759277 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111417, + "balance_loss_mlp": 1.09533608, + "epoch": 0.5144286263947672, + "flos": 526857675264.0, + "grad_norm": 0.07740296935823926, + "language_loss": 0.87230647, + "learning_rate": 0.0005009346263855848, + "loss": 0.88342059, + "num_input_tokens_seen": 222888448, + "router_z_loss_mlp": 0.1607666, + "step": 2674, + "time_per_iteration": 2.6561901569366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108221, + "balance_loss_mlp": 1.09159088, + "epoch": 0.5146210080800308, + "flos": 486518897664.0, + "grad_norm": 0.0608007463380774, + "language_loss": 0.83338469, + "learning_rate": 0.0005006230844586422, + "loss": 0.84446692, + "num_input_tokens_seen": 222964736, + "router_z_loss_mlp": 0.16638184, + "step": 2675, + "time_per_iteration": 2.7956371307373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110833, + "balance_loss_mlp": 1.09186745, + "epoch": 0.5148133897652943, + "flos": 515892063744.0, + "grad_norm": 0.06956599587127472, + "language_loss": 0.78915107, + "learning_rate": 0.0005003115422897968, + "loss": 0.80023432, + "num_input_tokens_seen": 223040944, + "router_z_loss_mlp": 0.16467285, + "step": 2676, + "time_per_iteration": 2.8026392459869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098426, + "balance_loss_mlp": 1.08178461, + "epoch": 0.5150057714505579, + "flos": 511212446208.0, + "grad_norm": 0.06380905094740742, + "language_loss": 0.87044096, + "learning_rate": 0.0005, + "loss": 0.8814252, + "num_input_tokens_seen": 223109632, + "router_z_loss_mlp": 0.16650391, + "step": 2677, + "time_per_iteration": 2.6397616863250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096356, + "balance_loss_mlp": 1.07940435, + "epoch": 0.5151981531358215, + "flos": 910909877760.0, + "grad_norm": 0.06972488542821374, + "language_loss": 0.79243249, + "learning_rate": 0.0004996884577102033, + "loss": 0.80339611, + 
"num_input_tokens_seen": 223191648, + "router_z_loss_mlp": 0.16967773, + "step": 2678, + "time_per_iteration": 3.1194515228271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109136, + "balance_loss_mlp": 1.07438445, + "epoch": 0.515390534821085, + "flos": 471864434688.0, + "grad_norm": 0.07627965924369287, + "language_loss": 0.84695083, + "learning_rate": 0.000499376915541358, + "loss": 0.85786444, + "num_input_tokens_seen": 223265920, + "router_z_loss_mlp": 0.16992188, + "step": 2679, + "time_per_iteration": 2.7068095207214355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089943, + "balance_loss_mlp": 1.07359934, + "epoch": 0.5155829165063486, + "flos": 650119468032.0, + "grad_norm": 0.06818096885322372, + "language_loss": 0.81243503, + "learning_rate": 0.0004990653736144155, + "loss": 0.8233344, + "num_input_tokens_seen": 223340688, + "router_z_loss_mlp": 0.16345215, + "step": 2680, + "time_per_iteration": 2.8939812183380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108849, + "balance_loss_mlp": 1.07127619, + "epoch": 0.5157752981916122, + "flos": 414262315008.0, + "grad_norm": 0.06989870799279192, + "language_loss": 0.85872787, + "learning_rate": 0.0004987538320503271, + "loss": 0.86961281, + "num_input_tokens_seen": 223404064, + "router_z_loss_mlp": 0.17236328, + "step": 2681, + "time_per_iteration": 2.5216612815856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082053, + "balance_loss_mlp": 1.06468463, + "epoch": 0.5159676798768758, + "flos": 553841473536.0, + "grad_norm": 0.08598338754099338, + "language_loss": 0.82912159, + "learning_rate": 0.0004984422909700442, + "loss": 0.8399421, + "num_input_tokens_seen": 223476784, + "router_z_loss_mlp": 0.17382812, + "step": 2682, + "time_per_iteration": 2.665601968765259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081809, + "balance_loss_mlp": 1.06371331, + "epoch": 0.5161600615621393, + "flos": 
586510709760.0, + "grad_norm": 0.06868623883512981, + "language_loss": 0.8358953, + "learning_rate": 0.0004981307504945173, + "loss": 0.84671342, + "num_input_tokens_seen": 223542832, + "router_z_loss_mlp": 0.1809082, + "step": 2683, + "time_per_iteration": 2.744506597518921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084499, + "balance_loss_mlp": 1.06714213, + "epoch": 0.5163524432474028, + "flos": 588843177984.0, + "grad_norm": 0.07139371766694287, + "language_loss": 0.89118385, + "learning_rate": 0.0004978192107446976, + "loss": 0.9020288, + "num_input_tokens_seen": 223617968, + "router_z_loss_mlp": 0.17370605, + "step": 2684, + "time_per_iteration": 2.840625762939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107857, + "balance_loss_mlp": 1.06075978, + "epoch": 0.5165448249326664, + "flos": 503893840896.0, + "grad_norm": 0.07781566774681065, + "language_loss": 0.87333429, + "learning_rate": 0.0004975076718415353, + "loss": 0.88411999, + "num_input_tokens_seen": 223689504, + "router_z_loss_mlp": 0.17810059, + "step": 2685, + "time_per_iteration": 2.6297128200531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076856, + "balance_loss_mlp": 1.05923653, + "epoch": 0.51673720661793, + "flos": 416760339456.0, + "grad_norm": 0.07734898237902697, + "language_loss": 0.90289825, + "learning_rate": 0.0004971961339059806, + "loss": 0.91366684, + "num_input_tokens_seen": 223752288, + "router_z_loss_mlp": 0.17626953, + "step": 2686, + "time_per_iteration": 2.5235214233398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079451, + "balance_loss_mlp": 1.06149805, + "epoch": 0.5169295883031936, + "flos": 598971096576.0, + "grad_norm": 0.08309998288602231, + "language_loss": 0.84119761, + "learning_rate": 0.0004968845970589832, + "loss": 0.85199213, + "num_input_tokens_seen": 223822304, + "router_z_loss_mlp": 0.17956543, + "step": 2687, + "time_per_iteration": 2.6999969482421875 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085317, + "balance_loss_mlp": 1.06760216, + "epoch": 0.517121969988457, + "flos": 556816343040.0, + "grad_norm": 0.0817039791962864, + "language_loss": 0.84468675, + "learning_rate": 0.0004965730614214926, + "loss": 0.85553992, + "num_input_tokens_seen": 223888592, + "router_z_loss_mlp": 0.17724609, + "step": 2688, + "time_per_iteration": 2.658827066421509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078556, + "balance_loss_mlp": 1.06094825, + "epoch": 0.5173143516737206, + "flos": 469445704704.0, + "grad_norm": 0.07334441433702203, + "language_loss": 0.85342443, + "learning_rate": 0.0004962615271144576, + "loss": 0.86421001, + "num_input_tokens_seen": 223952880, + "router_z_loss_mlp": 0.17626953, + "step": 2689, + "time_per_iteration": 2.50878643989563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086966, + "balance_loss_mlp": 1.06994319, + "epoch": 0.5175067333589842, + "flos": 720065977344.0, + "grad_norm": 0.12467871415324963, + "language_loss": 0.82284343, + "learning_rate": 0.0004959499942588264, + "loss": 0.83371305, + "num_input_tokens_seen": 224030000, + "router_z_loss_mlp": 0.17028809, + "step": 2690, + "time_per_iteration": 2.9249496459960938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0104705, + "balance_loss_mlp": 1.03822827, + "epoch": 0.5176991150442478, + "flos": 1466188480512.0, + "grad_norm": 0.03199266467607697, + "language_loss": 0.78200024, + "learning_rate": 0.0004956384629755469, + "loss": 0.79247075, + "num_input_tokens_seen": 224252384, + "router_z_loss_mlp": 0.08837891, + "step": 2691, + "time_per_iteration": 4.82594108581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090274, + "balance_loss_mlp": 1.07309616, + "epoch": 0.5178914967295114, + "flos": 612632222208.0, + "grad_norm": 0.07423408614425925, + "language_loss": 0.85369182, + "learning_rate": 0.0004953269333855661, + "loss": 0.86459452, + 
"num_input_tokens_seen": 224324640, + "router_z_loss_mlp": 0.17175293, + "step": 2692, + "time_per_iteration": 2.777863025665283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093446, + "balance_loss_mlp": 1.07593369, + "epoch": 0.5180838784147749, + "flos": 500926311936.0, + "grad_norm": 0.08941680356551608, + "language_loss": 0.84251738, + "learning_rate": 0.0004950154056098309, + "loss": 0.85345179, + "num_input_tokens_seen": 224398368, + "router_z_loss_mlp": 0.17529297, + "step": 2693, + "time_per_iteration": 2.7481398582458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097937, + "balance_loss_mlp": 1.08010364, + "epoch": 0.5182762601000385, + "flos": 688832418816.0, + "grad_norm": 0.07099923409869693, + "language_loss": 0.84394872, + "learning_rate": 0.0004947038797692867, + "loss": 0.85492814, + "num_input_tokens_seen": 224465456, + "router_z_loss_mlp": 0.1784668, + "step": 2694, + "time_per_iteration": 2.8453128337860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113818, + "balance_loss_mlp": 1.096771, + "epoch": 0.518468641785302, + "flos": 665611623936.0, + "grad_norm": 0.06154827687851128, + "language_loss": 0.77520609, + "learning_rate": 0.0004943923559848789, + "loss": 0.78634429, + "num_input_tokens_seen": 224540960, + "router_z_loss_mlp": 0.1706543, + "step": 2695, + "time_per_iteration": 2.841853141784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124321, + "balance_loss_mlp": 1.10654736, + "epoch": 0.5186610234705656, + "flos": 566714465280.0, + "grad_norm": 0.06645104429405103, + "language_loss": 0.90406942, + "learning_rate": 0.0004940808343775515, + "loss": 0.91531265, + "num_input_tokens_seen": 224613200, + "router_z_loss_mlp": 0.17773438, + "step": 2696, + "time_per_iteration": 2.749504327774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118087, + "balance_loss_mlp": 1.10027719, + "epoch": 0.5188534051558291, + "flos": 
428879702016.0, + "grad_norm": 0.07841169466401897, + "language_loss": 0.82063687, + "learning_rate": 0.0004937693150682479, + "loss": 0.83181769, + "num_input_tokens_seen": 224677456, + "router_z_loss_mlp": 0.17810059, + "step": 2697, + "time_per_iteration": 2.5522847175598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118895, + "balance_loss_mlp": 1.10168159, + "epoch": 0.5190457868410927, + "flos": 546349971456.0, + "grad_norm": 0.07394243959698338, + "language_loss": 0.76709116, + "learning_rate": 0.0004934577981779107, + "loss": 0.77828008, + "num_input_tokens_seen": 224745600, + "router_z_loss_mlp": 0.17224121, + "step": 2698, + "time_per_iteration": 2.72316312789917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115498, + "balance_loss_mlp": 1.09879637, + "epoch": 0.5192381685263563, + "flos": 548605716480.0, + "grad_norm": 0.0912267088784467, + "language_loss": 0.8119272, + "learning_rate": 0.0004931462838274817, + "loss": 0.82308215, + "num_input_tokens_seen": 224826944, + "router_z_loss_mlp": 0.16711426, + "step": 2699, + "time_per_iteration": 2.8209919929504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107121, + "balance_loss_mlp": 1.08981156, + "epoch": 0.5194305502116199, + "flos": 575263544832.0, + "grad_norm": 0.10066489144579434, + "language_loss": 0.83903617, + "learning_rate": 0.0004928347721379011, + "loss": 0.85010743, + "num_input_tokens_seen": 224895280, + "router_z_loss_mlp": 0.17333984, + "step": 2700, + "time_per_iteration": 2.679414749145508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098221, + "balance_loss_mlp": 1.08088803, + "epoch": 0.5196229318968835, + "flos": 434258620416.0, + "grad_norm": 0.06308374672073903, + "language_loss": 0.82055807, + "learning_rate": 0.0004925232632301089, + "loss": 0.83154029, + "num_input_tokens_seen": 224961632, + "router_z_loss_mlp": 0.17346191, + "step": 2701, + "time_per_iteration": 2.5568413734436035 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086175, + "balance_loss_mlp": 1.06934261, + "epoch": 0.5198153135821469, + "flos": 558881938944.0, + "grad_norm": 0.07257701027520803, + "language_loss": 0.79591668, + "learning_rate": 0.0004922117572250431, + "loss": 0.80677843, + "num_input_tokens_seen": 225032816, + "router_z_loss_mlp": 0.16845703, + "step": 2702, + "time_per_iteration": 2.6907496452331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085203, + "balance_loss_mlp": 1.06819224, + "epoch": 0.5200076952674105, + "flos": 565684051968.0, + "grad_norm": 0.08909916825126464, + "language_loss": 0.80501723, + "learning_rate": 0.0004919002542436414, + "loss": 0.81586921, + "num_input_tokens_seen": 225112736, + "router_z_loss_mlp": 0.17016602, + "step": 2703, + "time_per_iteration": 2.8154964447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087916, + "balance_loss_mlp": 1.07078612, + "epoch": 0.5202000769526741, + "flos": 571186681344.0, + "grad_norm": 0.07574293506029897, + "language_loss": 0.8094272, + "learning_rate": 0.0004915887544068399, + "loss": 0.82030636, + "num_input_tokens_seen": 225182672, + "router_z_loss_mlp": 0.17138672, + "step": 2704, + "time_per_iteration": 2.6723296642303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080297, + "balance_loss_mlp": 1.06322646, + "epoch": 0.5203924586379377, + "flos": 694211337216.0, + "grad_norm": 0.08223729103851085, + "language_loss": 0.78410661, + "learning_rate": 0.0004912772578355736, + "loss": 0.79490954, + "num_input_tokens_seen": 225260272, + "router_z_loss_mlp": 0.1706543, + "step": 2705, + "time_per_iteration": 2.904359817504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080431, + "balance_loss_mlp": 1.06288326, + "epoch": 0.5205848403232012, + "flos": 566509261824.0, + "grad_norm": 0.0867272148609526, + "language_loss": 0.82534099, + "learning_rate": 0.000490965764650776, + "loss": 0.83614528, 
+ "num_input_tokens_seen": 225337120, + "router_z_loss_mlp": 0.17553711, + "step": 2706, + "time_per_iteration": 2.893965005874634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082267, + "balance_loss_mlp": 1.06508923, + "epoch": 0.5207772220084648, + "flos": 1214259932160.0, + "grad_norm": 0.08899008608425168, + "language_loss": 0.82646501, + "learning_rate": 0.0004906542749733798, + "loss": 0.83728766, + "num_input_tokens_seen": 225433984, + "router_z_loss_mlp": 0.171875, + "step": 2707, + "time_per_iteration": 3.642857313156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081324, + "balance_loss_mlp": 1.06468248, + "epoch": 0.5209696036937284, + "flos": 592843318272.0, + "grad_norm": 0.06383765372803735, + "language_loss": 0.85145414, + "learning_rate": 0.0004903427889243156, + "loss": 0.86226737, + "num_input_tokens_seen": 225512112, + "router_z_loss_mlp": 0.16650391, + "step": 2708, + "time_per_iteration": 2.8898375034332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091262, + "balance_loss_mlp": 1.074036, + "epoch": 0.5211619853789919, + "flos": 522889468416.0, + "grad_norm": 0.07905445780966364, + "language_loss": 0.85149866, + "learning_rate": 0.0004900313066245134, + "loss": 0.86241126, + "num_input_tokens_seen": 225586944, + "router_z_loss_mlp": 0.17236328, + "step": 2709, + "time_per_iteration": 2.65574049949646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088104, + "balance_loss_mlp": 1.07130718, + "epoch": 0.5213543670642555, + "flos": 502799187456.0, + "grad_norm": 0.07812284997006956, + "language_loss": 0.80880928, + "learning_rate": 0.0004897198281949012, + "loss": 0.81969029, + "num_input_tokens_seen": 225657184, + "router_z_loss_mlp": 0.16796875, + "step": 2710, + "time_per_iteration": 2.672153949737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103259, + "balance_loss_mlp": 1.08604503, + "epoch": 0.521546748749519, + "flos": 
585959712768.0, + "grad_norm": 0.07691692452987973, + "language_loss": 0.77799213, + "learning_rate": 0.0004894083537564057, + "loss": 0.78902471, + "num_input_tokens_seen": 225729968, + "router_z_loss_mlp": 0.17236328, + "step": 2711, + "time_per_iteration": 2.7532706260681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104375, + "balance_loss_mlp": 1.08732796, + "epoch": 0.5217391304347826, + "flos": 570119192064.0, + "grad_norm": 0.07306223578012608, + "language_loss": 0.80945504, + "learning_rate": 0.0004890968834299519, + "loss": 0.82049876, + "num_input_tokens_seen": 225801808, + "router_z_loss_mlp": 0.1706543, + "step": 2712, + "time_per_iteration": 2.7456612586975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113403, + "balance_loss_mlp": 1.09663057, + "epoch": 0.5219315121200462, + "flos": 542784457728.0, + "grad_norm": 0.06414784694166918, + "language_loss": 0.7858941, + "learning_rate": 0.0004887854173364633, + "loss": 0.79702818, + "num_input_tokens_seen": 225878576, + "router_z_loss_mlp": 0.16784668, + "step": 2713, + "time_per_iteration": 2.731410503387451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116912, + "balance_loss_mlp": 1.10033011, + "epoch": 0.5221238938053098, + "flos": 550310464512.0, + "grad_norm": 0.062429546921528134, + "language_loss": 0.8127901, + "learning_rate": 0.0004884739555968617, + "loss": 0.82395923, + "num_input_tokens_seen": 225960096, + "router_z_loss_mlp": 0.16589355, + "step": 2714, + "time_per_iteration": 2.8288521766662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024153, + "balance_loss_mlp": 1.01604629, + "epoch": 0.5223162754905732, + "flos": 1355174157312.0, + "grad_norm": 0.017358883808072843, + "language_loss": 0.78977054, + "learning_rate": 0.0004881624983320676, + "loss": 0.80001205, + "num_input_tokens_seen": 226184960, + "router_z_loss_mlp": 0.08105469, + "step": 2715, + "time_per_iteration": 5.007716417312622 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124661, + "balance_loss_mlp": 1.10728037, + "epoch": 0.5225086571758368, + "flos": 567747076608.0, + "grad_norm": 0.06973573346877397, + "language_loss": 0.86611319, + "learning_rate": 0.0004878510456629992, + "loss": 0.87735981, + "num_input_tokens_seen": 226271328, + "router_z_loss_mlp": 0.17407227, + "step": 2716, + "time_per_iteration": 3.006253957748413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131765, + "balance_loss_mlp": 1.11461031, + "epoch": 0.5227010388611004, + "flos": 500158001664.0, + "grad_norm": 0.07218030120275976, + "language_loss": 0.85169446, + "learning_rate": 0.00048753959771057314, + "loss": 0.86301208, + "num_input_tokens_seen": 226340080, + "router_z_loss_mlp": 0.17175293, + "step": 2717, + "time_per_iteration": 2.6976563930511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121586, + "balance_loss_mlp": 1.10383558, + "epoch": 0.522893420546364, + "flos": 597656558592.0, + "grad_norm": 0.07681806180198643, + "language_loss": 0.82615161, + "learning_rate": 0.0004872281545957044, + "loss": 0.83736753, + "num_input_tokens_seen": 226415120, + "router_z_loss_mlp": 0.1776123, + "step": 2718, + "time_per_iteration": 2.8015332221984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117931, + "balance_loss_mlp": 1.10027635, + "epoch": 0.5230858022316276, + "flos": 664605803520.0, + "grad_norm": 0.058351443586734386, + "language_loss": 0.85597366, + "learning_rate": 0.0004869167164393055, + "loss": 0.86715293, + "num_input_tokens_seen": 226501200, + "router_z_loss_mlp": 0.17675781, + "step": 2719, + "time_per_iteration": 2.9708495140075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116486, + "balance_loss_mlp": 1.09911728, + "epoch": 0.5232781839168911, + "flos": 603843434496.0, + "grad_norm": 0.06620613765458017, + "language_loss": 0.88742125, + "learning_rate": 0.00048660528336228793, + "loss": 
0.89858615, + "num_input_tokens_seen": 226582064, + "router_z_loss_mlp": 0.17382812, + "step": 2720, + "time_per_iteration": 2.7995879650115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106216, + "balance_loss_mlp": 1.08846569, + "epoch": 0.5234705656021547, + "flos": 550718300160.0, + "grad_norm": 0.06179859794056996, + "language_loss": 0.90307331, + "learning_rate": 0.0004862938554855606, + "loss": 0.91413546, + "num_input_tokens_seen": 226656448, + "router_z_loss_mlp": 0.1776123, + "step": 2721, + "time_per_iteration": 2.8321540355682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104126, + "balance_loss_mlp": 1.08690071, + "epoch": 0.5236629472874182, + "flos": 504279281664.0, + "grad_norm": 0.07085532730134622, + "language_loss": 0.85930234, + "learning_rate": 0.0004859824329300304, + "loss": 0.87034363, + "num_input_tokens_seen": 226725568, + "router_z_loss_mlp": 0.17248535, + "step": 2722, + "time_per_iteration": 2.6302812099456787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110407, + "balance_loss_mlp": 1.08649826, + "epoch": 0.5238553289726818, + "flos": 547654597632.0, + "grad_norm": 0.07263306317055565, + "language_loss": 0.83477378, + "learning_rate": 0.00048567101581660244, + "loss": 0.84581447, + "num_input_tokens_seen": 226795728, + "router_z_loss_mlp": 0.17590332, + "step": 2723, + "time_per_iteration": 2.68910813331604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109903, + "balance_loss_mlp": 1.08181643, + "epoch": 0.5240477106579453, + "flos": 531962380800.0, + "grad_norm": 0.11439626446879424, + "language_loss": 0.87057537, + "learning_rate": 0.00048535960426617956, + "loss": 0.88156569, + "num_input_tokens_seen": 226865344, + "router_z_loss_mlp": 0.17236328, + "step": 2724, + "time_per_iteration": 2.622817039489746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.07238674, + "epoch": 0.5242400923432089, + 
"flos": 617939559936.0, + "grad_norm": 0.061793488209652164, + "language_loss": 0.8146565, + "learning_rate": 0.0004850481983996621, + "loss": 0.8255589, + "num_input_tokens_seen": 226936800, + "router_z_loss_mlp": 0.17871094, + "step": 2725, + "time_per_iteration": 2.7661449909210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_mlp": 1.07968855, + "epoch": 0.5244324740284725, + "flos": 416686187520.0, + "grad_norm": 0.1002744758401102, + "language_loss": 0.87726384, + "learning_rate": 0.0004847367983379492, + "loss": 0.8882367, + "num_input_tokens_seen": 226998448, + "router_z_loss_mlp": 0.17602539, + "step": 2726, + "time_per_iteration": 2.501094341278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096174, + "balance_loss_mlp": 1.0795207, + "epoch": 0.5246248557137361, + "flos": 626436509184.0, + "grad_norm": 0.06877444759134967, + "language_loss": 0.78732175, + "learning_rate": 0.00048442540420193643, + "loss": 0.79828346, + "num_input_tokens_seen": 227081872, + "router_z_loss_mlp": 0.16662598, + "step": 2727, + "time_per_iteration": 2.9280529022216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091328, + "balance_loss_mlp": 1.07391191, + "epoch": 0.5248172373989997, + "flos": 1248463590912.0, + "grad_norm": 0.07855483173762376, + "language_loss": 0.79334521, + "learning_rate": 0.0004841140161125182, + "loss": 0.80425853, + "num_input_tokens_seen": 227167744, + "router_z_loss_mlp": 0.17431641, + "step": 2728, + "time_per_iteration": 3.626858711242676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093412, + "balance_loss_mlp": 1.07654381, + "epoch": 0.5250096190842631, + "flos": 506868710400.0, + "grad_norm": 0.08285412332857332, + "language_loss": 0.8463819, + "learning_rate": 0.0004838026341905857, + "loss": 0.85731602, + "num_input_tokens_seen": 227239136, + "router_z_loss_mlp": 0.16870117, + "step": 2729, + "time_per_iteration": 2.7793312072753906 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088713, + "balance_loss_mlp": 1.07182097, + "epoch": 0.5252020007695267, + "flos": 611317684224.0, + "grad_norm": 0.07499858641848273, + "language_loss": 0.85196304, + "learning_rate": 0.00048349125855702844, + "loss": 0.86285013, + "num_input_tokens_seen": 227311968, + "router_z_loss_mlp": 0.16906738, + "step": 2730, + "time_per_iteration": 2.8079419136047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092888, + "balance_loss_mlp": 1.07605541, + "epoch": 0.5253943824547903, + "flos": 539233998336.0, + "grad_norm": 0.07740216541040414, + "language_loss": 0.81396556, + "learning_rate": 0.00048317988933273287, + "loss": 0.82489449, + "num_input_tokens_seen": 227385248, + "router_z_loss_mlp": 0.16845703, + "step": 2731, + "time_per_iteration": 2.772430419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084718, + "balance_loss_mlp": 1.06807661, + "epoch": 0.5255867641400539, + "flos": 698038580736.0, + "grad_norm": 0.18745226220584338, + "language_loss": 0.82080007, + "learning_rate": 0.00048286852663858367, + "loss": 0.83164728, + "num_input_tokens_seen": 227464640, + "router_z_loss_mlp": 0.16650391, + "step": 2732, + "time_per_iteration": 2.9268972873687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087343, + "balance_loss_mlp": 1.07036781, + "epoch": 0.5257791458253175, + "flos": 667289207808.0, + "grad_norm": 0.08325512934533874, + "language_loss": 0.8380754, + "learning_rate": 0.000482557170595462, + "loss": 0.84894884, + "num_input_tokens_seen": 227542192, + "router_z_loss_mlp": 0.16992188, + "step": 2733, + "time_per_iteration": 2.8951096534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093841, + "balance_loss_mlp": 1.07677019, + "epoch": 0.525971527510581, + "flos": 483620751360.0, + "grad_norm": 0.08900957978988387, + "language_loss": 0.87469298, + "learning_rate": 0.0004822458213242475, + "loss": 
0.88563132, + "num_input_tokens_seen": 227606096, + "router_z_loss_mlp": 0.17089844, + "step": 2734, + "time_per_iteration": 2.5620529651641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110054, + "balance_loss_mlp": 1.09249437, + "epoch": 0.5261639091958445, + "flos": 829916264448.0, + "grad_norm": 0.0633406501514696, + "language_loss": 0.85937345, + "learning_rate": 0.00048193447894581627, + "loss": 0.87047398, + "num_input_tokens_seen": 227689552, + "router_z_loss_mlp": 0.17565918, + "step": 2735, + "time_per_iteration": 3.103132486343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118378, + "balance_loss_mlp": 1.10083008, + "epoch": 0.5263562908811081, + "flos": 520715215872.0, + "grad_norm": 0.0756952830822362, + "language_loss": 0.87890029, + "learning_rate": 0.00048162314358104243, + "loss": 0.89008415, + "num_input_tokens_seen": 227760784, + "router_z_loss_mlp": 0.17565918, + "step": 2736, + "time_per_iteration": 2.6416001319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117726, + "balance_loss_mlp": 1.10027409, + "epoch": 0.5265486725663717, + "flos": 574996672512.0, + "grad_norm": 0.09251963370546762, + "language_loss": 0.83179659, + "learning_rate": 0.0004813118153507969, + "loss": 0.84297383, + "num_input_tokens_seen": 227834304, + "router_z_loss_mlp": 0.17468262, + "step": 2737, + "time_per_iteration": 2.7370142936706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078212, + "balance_loss_mlp": 1.0679127, + "epoch": 0.5267410542516352, + "flos": 1547261015040.0, + "grad_norm": 0.03576440897911325, + "language_loss": 0.82447124, + "learning_rate": 0.0004810004943759482, + "loss": 0.83525336, + "num_input_tokens_seen": 228057232, + "router_z_loss_mlp": 0.10302734, + "step": 2738, + "time_per_iteration": 4.797177076339722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110775, + "balance_loss_mlp": 1.08933258, + "epoch": 0.5269334359368988, + 
"flos": 929952493056.0, + "grad_norm": 0.07588810399495584, + "language_loss": 0.83266842, + "learning_rate": 0.00048068918077736163, + "loss": 0.84374589, + "num_input_tokens_seen": 228140816, + "router_z_loss_mlp": 0.18408203, + "step": 2739, + "time_per_iteration": 3.2253060340881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109875, + "balance_loss_mlp": 1.0805707, + "epoch": 0.5271258176221624, + "flos": 655389729792.0, + "grad_norm": 0.07650809384335877, + "language_loss": 0.81149924, + "learning_rate": 0.0004803778746759001, + "loss": 0.82248676, + "num_input_tokens_seen": 228216208, + "router_z_loss_mlp": 0.18188477, + "step": 2740, + "time_per_iteration": 2.917982578277588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091336, + "balance_loss_mlp": 1.07380056, + "epoch": 0.527318199307426, + "flos": 543036648960.0, + "grad_norm": 0.08493152657291815, + "language_loss": 0.81563872, + "learning_rate": 0.00048006657619242317, + "loss": 0.82655203, + "num_input_tokens_seen": 228283184, + "router_z_loss_mlp": 0.17553711, + "step": 2741, + "time_per_iteration": 2.6491029262542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083991, + "balance_loss_mlp": 1.0661335, + "epoch": 0.5275105809926895, + "flos": 447882670080.0, + "grad_norm": 0.09642753382189671, + "language_loss": 0.78573406, + "learning_rate": 0.00047975528544778775, + "loss": 0.79657394, + "num_input_tokens_seen": 228351328, + "router_z_loss_mlp": 0.17858887, + "step": 2742, + "time_per_iteration": 2.6600565910339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080617, + "balance_loss_mlp": 1.06256926, + "epoch": 0.527702962677953, + "flos": 578935143936.0, + "grad_norm": 0.07268225763303592, + "language_loss": 0.88256997, + "learning_rate": 0.00047944400256284754, + "loss": 0.89337611, + "num_input_tokens_seen": 228423632, + "router_z_loss_mlp": 0.18041992, + "step": 2743, + "time_per_iteration": 
2.7662084102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108031, + "balance_loss_mlp": 1.06228542, + "epoch": 0.5278953443632166, + "flos": 652773136896.0, + "grad_norm": 0.07011617815169531, + "language_loss": 0.79666251, + "learning_rate": 0.0004791327276584532, + "loss": 0.80746561, + "num_input_tokens_seen": 228498736, + "router_z_loss_mlp": 0.18041992, + "step": 2744, + "time_per_iteration": 2.835545301437378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075943, + "balance_loss_mlp": 1.05737054, + "epoch": 0.5280877260484802, + "flos": 514001935872.0, + "grad_norm": 0.08121623581547996, + "language_loss": 0.80470204, + "learning_rate": 0.00047882146085545264, + "loss": 0.81546152, + "num_input_tokens_seen": 228569056, + "router_z_loss_mlp": 0.18566895, + "step": 2745, + "time_per_iteration": 2.690206289291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_mlp": 1.02781987, + "epoch": 0.5282801077337438, + "flos": 1445460567552.0, + "grad_norm": 0.02647915133994321, + "language_loss": 0.75402379, + "learning_rate": 0.00047851020227469, + "loss": 0.76439977, + "num_input_tokens_seen": 228800560, + "router_z_loss_mlp": 0.09765625, + "step": 2746, + "time_per_iteration": 5.020122766494751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074184, + "balance_loss_mlp": 1.05564749, + "epoch": 0.5284724894190073, + "flos": 604856595456.0, + "grad_norm": 0.0832805570330896, + "language_loss": 0.79321563, + "learning_rate": 0.00047819895203700684, + "loss": 0.80395758, + "num_input_tokens_seen": 228869216, + "router_z_loss_mlp": 0.18530273, + "step": 2747, + "time_per_iteration": 2.770418167114258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_mlp": 1.02084875, + "epoch": 0.5286648711042709, + "flos": 1494956321280.0, + "grad_norm": 0.025219008400043496, + "language_loss": 0.75512433, + "learning_rate": 
0.0004778877102632412, + "loss": 0.76542532, + "num_input_tokens_seen": 229085520, + "router_z_loss_mlp": 0.09228516, + "step": 2748, + "time_per_iteration": 4.670547246932983 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_mlp": 1.04841685, + "epoch": 0.5288572527895344, + "flos": 597616911360.0, + "grad_norm": 0.08023961077007181, + "language_loss": 0.88480437, + "learning_rate": 0.0004775764770742277, + "loss": 0.89546895, + "num_input_tokens_seen": 229160912, + "router_z_loss_mlp": 0.18041992, + "step": 2749, + "time_per_iteration": 2.8597028255462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074765, + "balance_loss_mlp": 1.05651426, + "epoch": 0.529049634474798, + "flos": 557320352256.0, + "grad_norm": 0.0872100074417497, + "language_loss": 0.86519742, + "learning_rate": 0.00047726525259079777, + "loss": 0.87594503, + "num_input_tokens_seen": 229235792, + "router_z_loss_mlp": 0.18237305, + "step": 2750, + "time_per_iteration": 2.7900798320770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080608, + "balance_loss_mlp": 1.06233358, + "epoch": 0.5292420161600616, + "flos": 581274952704.0, + "grad_norm": 0.10808949355702925, + "language_loss": 0.88474864, + "learning_rate": 0.0004769540369337798, + "loss": 0.89555472, + "num_input_tokens_seen": 229309984, + "router_z_loss_mlp": 0.18261719, + "step": 2751, + "time_per_iteration": 2.7448270320892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083505, + "balance_loss_mlp": 1.0650394, + "epoch": 0.5294343978453251, + "flos": 608303167488.0, + "grad_norm": 0.06879132043127602, + "language_loss": 0.85886008, + "learning_rate": 0.00047664283022399794, + "loss": 0.86969519, + "num_input_tokens_seen": 229394000, + "router_z_loss_mlp": 0.18469238, + "step": 2752, + "time_per_iteration": 2.8719866275787354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080341, + "balance_loss_mlp": 1.06261468, 
+ "epoch": 0.5296267795305887, + "flos": 646522020864.0, + "grad_norm": 0.0740043611556158, + "language_loss": 0.81022358, + "learning_rate": 0.00047633163258227376, + "loss": 0.82102704, + "num_input_tokens_seen": 229474320, + "router_z_loss_mlp": 0.17736816, + "step": 2753, + "time_per_iteration": 2.904007911682129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108486, + "balance_loss_mlp": 1.06734776, + "epoch": 0.5298191612158523, + "flos": 559746796032.0, + "grad_norm": 0.07290364739094941, + "language_loss": 0.85516405, + "learning_rate": 0.0004760204441294247, + "loss": 0.86601269, + "num_input_tokens_seen": 229543072, + "router_z_loss_mlp": 0.17529297, + "step": 2754, + "time_per_iteration": 2.728672504425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095448, + "balance_loss_mlp": 1.07741165, + "epoch": 0.5300115429011159, + "flos": 514046352384.0, + "grad_norm": 0.0727695026629463, + "language_loss": 0.86100507, + "learning_rate": 0.00047570926498626486, + "loss": 0.87195957, + "num_input_tokens_seen": 229615296, + "router_z_loss_mlp": 0.18066406, + "step": 2755, + "time_per_iteration": 2.726902484893799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099745, + "balance_loss_mlp": 1.08242369, + "epoch": 0.5302039245863793, + "flos": 672789265920.0, + "grad_norm": 0.05921570741986168, + "language_loss": 0.81395233, + "learning_rate": 0.00047539809527360474, + "loss": 0.82494974, + "num_input_tokens_seen": 229693728, + "router_z_loss_mlp": 0.17333984, + "step": 2756, + "time_per_iteration": 2.87945556640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115628, + "balance_loss_mlp": 1.09774637, + "epoch": 0.5303963062716429, + "flos": 730836297216.0, + "grad_norm": 0.05551434768366506, + "language_loss": 0.82287431, + "learning_rate": 0.0004750869351122511, + "loss": 0.83403063, + "num_input_tokens_seen": 229772144, + "router_z_loss_mlp": 0.17883301, + "step": 2757, + 
"time_per_iteration": 3.0493249893188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112447, + "balance_loss_mlp": 1.10749459, + "epoch": 0.5305886879569065, + "flos": 573435085824.0, + "grad_norm": 0.0694425557197165, + "language_loss": 0.82020032, + "learning_rate": 0.00047477578462300685, + "loss": 0.83144498, + "num_input_tokens_seen": 229847024, + "router_z_loss_mlp": 0.16992188, + "step": 2758, + "time_per_iteration": 2.7602713108062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123309, + "balance_loss_mlp": 1.10578477, + "epoch": 0.5307810696421701, + "flos": 695335352832.0, + "grad_norm": 0.07804964416900076, + "language_loss": 0.79339695, + "learning_rate": 0.0004744646439266718, + "loss": 0.80463004, + "num_input_tokens_seen": 229932416, + "router_z_loss_mlp": 0.17541504, + "step": 2759, + "time_per_iteration": 3.010812997817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119146, + "balance_loss_mlp": 1.10195613, + "epoch": 0.5309734513274337, + "flos": 648943322112.0, + "grad_norm": 0.056360612774155563, + "language_loss": 0.92028886, + "learning_rate": 0.000474153513144041, + "loss": 0.93148029, + "num_input_tokens_seen": 230010976, + "router_z_loss_mlp": 0.17199707, + "step": 2760, + "time_per_iteration": 2.9704673290252686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128006, + "balance_loss_mlp": 1.11117363, + "epoch": 0.5311658330126972, + "flos": 604824288768.0, + "grad_norm": 0.08001771173719906, + "language_loss": 0.86726296, + "learning_rate": 0.00047384239239590633, + "loss": 0.87854302, + "num_input_tokens_seen": 230093344, + "router_z_loss_mlp": 0.16845703, + "step": 2761, + "time_per_iteration": 2.891458749771118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129372, + "balance_loss_mlp": 1.11169338, + "epoch": 0.5313582146979607, + "flos": 558259361280.0, + "grad_norm": 0.06781273866770807, + "language_loss": 0.88723642, + 
"learning_rate": 0.0004735312818030556, + "loss": 0.89853013, + "num_input_tokens_seen": 230165520, + "router_z_loss_mlp": 0.17700195, + "step": 2762, + "time_per_iteration": 2.7164249420166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127323, + "balance_loss_mlp": 1.11076498, + "epoch": 0.5315505963832243, + "flos": 508410473472.0, + "grad_norm": 0.06505824064287292, + "language_loss": 0.82414401, + "learning_rate": 0.0004732201814862727, + "loss": 0.83541727, + "num_input_tokens_seen": 230237808, + "router_z_loss_mlp": 0.16564941, + "step": 2763, + "time_per_iteration": 2.726468563079834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123829, + "balance_loss_mlp": 1.10723543, + "epoch": 0.5317429780684879, + "flos": 626439080448.0, + "grad_norm": 0.06470267434285343, + "language_loss": 0.81489587, + "learning_rate": 0.0004729090915663373, + "loss": 0.82613409, + "num_input_tokens_seen": 230321568, + "router_z_loss_mlp": 0.16601562, + "step": 2764, + "time_per_iteration": 2.8475723266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123779, + "balance_loss_mlp": 1.10759008, + "epoch": 0.5319353597537514, + "flos": 476744486400.0, + "grad_norm": 0.11068637871952317, + "language_loss": 0.85001844, + "learning_rate": 0.00047259801216402534, + "loss": 0.86125624, + "num_input_tokens_seen": 230385376, + "router_z_loss_mlp": 0.16186523, + "step": 2765, + "time_per_iteration": 2.540780544281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116718, + "balance_loss_mlp": 1.10029066, + "epoch": 0.532127741439015, + "flos": 501635524608.0, + "grad_norm": 0.07674788190906832, + "language_loss": 0.86407942, + "learning_rate": 0.00047228694340010845, + "loss": 0.87524652, + "num_input_tokens_seen": 230449760, + "router_z_loss_mlp": 0.16430664, + "step": 2766, + "time_per_iteration": 2.590508460998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121045, + 
"balance_loss_mlp": 1.1044749, + "epoch": 0.5323201231242786, + "flos": 1164586512384.0, + "grad_norm": 0.07081285799421494, + "language_loss": 0.85664678, + "learning_rate": 0.0004719758853953544, + "loss": 0.86785722, + "num_input_tokens_seen": 230536592, + "router_z_loss_mlp": 0.16577148, + "step": 2767, + "time_per_iteration": 3.6536149978637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118047, + "balance_loss_mlp": 1.10160804, + "epoch": 0.5325125048095422, + "flos": 378702273024.0, + "grad_norm": 0.1001432749586202, + "language_loss": 0.83710611, + "learning_rate": 0.00047166483827052645, + "loss": 0.84828657, + "num_input_tokens_seen": 230596688, + "router_z_loss_mlp": 0.16442871, + "step": 2768, + "time_per_iteration": 2.437939167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01234354, + "balance_loss_mlp": 1.22538948, + "epoch": 0.5327048864948057, + "flos": 1541353121280.0, + "grad_norm": 0.06972612650118978, + "language_loss": 0.77078491, + "learning_rate": 0.00047135380214638413, + "loss": 0.78312844, + "num_input_tokens_seen": 230829408, + "router_z_loss_mlp": 0.08984375, + "step": 2769, + "time_per_iteration": 5.026838779449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115645, + "balance_loss_mlp": 1.09895587, + "epoch": 0.5328972681800692, + "flos": 911272923648.0, + "grad_norm": 0.0780544569178282, + "language_loss": 0.83743083, + "learning_rate": 0.000471042777143682, + "loss": 0.84858727, + "num_input_tokens_seen": 230912528, + "router_z_loss_mlp": 0.16699219, + "step": 2770, + "time_per_iteration": 3.230933427810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113101, + "balance_loss_mlp": 1.09710324, + "epoch": 0.5330896498653328, + "flos": 473898097152.0, + "grad_norm": 0.20675341395216595, + "language_loss": 0.79602915, + "learning_rate": 0.0004707317633831707, + "loss": 0.80716014, + "num_input_tokens_seen": 230979424, + "router_z_loss_mlp": 
0.15991211, + "step": 2771, + "time_per_iteration": 2.6368706226348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106292, + "balance_loss_mlp": 1.09012711, + "epoch": 0.5332820315505964, + "flos": 501635524608.0, + "grad_norm": 0.0712649510509903, + "language_loss": 0.77926189, + "learning_rate": 0.00047042076098559673, + "loss": 0.79032481, + "num_input_tokens_seen": 231046416, + "router_z_loss_mlp": 0.16162109, + "step": 2772, + "time_per_iteration": 2.633755683898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105276, + "balance_loss_mlp": 1.08895612, + "epoch": 0.53347441323586, + "flos": 924439951872.0, + "grad_norm": 0.08177633680773212, + "language_loss": 0.74153018, + "learning_rate": 0.00047010977007170174, + "loss": 0.75258291, + "num_input_tokens_seen": 231136064, + "router_z_loss_mlp": 0.16320801, + "step": 2773, + "time_per_iteration": 3.257364273071289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105171, + "balance_loss_mlp": 1.08880353, + "epoch": 0.5336667949211235, + "flos": 574455587328.0, + "grad_norm": 0.08878543355304569, + "language_loss": 0.8234973, + "learning_rate": 0.00046979879076222334, + "loss": 0.83454895, + "num_input_tokens_seen": 231203616, + "router_z_loss_mlp": 0.16369629, + "step": 2774, + "time_per_iteration": 2.6948111057281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115652, + "balance_loss_mlp": 1.09958255, + "epoch": 0.533859176606387, + "flos": 1064664082944.0, + "grad_norm": 0.07031279684874672, + "language_loss": 0.84660083, + "learning_rate": 0.0004694878231778939, + "loss": 0.85775733, + "num_input_tokens_seen": 231287008, + "router_z_loss_mlp": 0.16064453, + "step": 2775, + "time_per_iteration": 3.391101121902466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111582, + "balance_loss_mlp": 1.09510732, + "epoch": 0.5340515582916506, + "flos": 746602665984.0, + "grad_norm": 0.06461927889010362, + 
"language_loss": 0.84379047, + "learning_rate": 0.0004691768674394423, + "loss": 0.85490632, + "num_input_tokens_seen": 231365296, + "router_z_loss_mlp": 0.16479492, + "step": 2776, + "time_per_iteration": 2.9977481365203857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_mlp": 1.03071785, + "epoch": 0.5342439399769142, + "flos": 1445685594624.0, + "grad_norm": 0.02105469632037268, + "language_loss": 0.84484011, + "learning_rate": 0.0004688659236675918, + "loss": 0.85523784, + "num_input_tokens_seen": 231579040, + "router_z_loss_mlp": 0.09082031, + "step": 2777, + "time_per_iteration": 4.769741535186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_mlp": 1.02591205, + "epoch": 0.5344363216621778, + "flos": 1427569505280.0, + "grad_norm": 0.019005935883373085, + "language_loss": 0.76653534, + "learning_rate": 0.00046855499198306187, + "loss": 0.77688694, + "num_input_tokens_seen": 231812736, + "router_z_loss_mlp": 0.09228516, + "step": 2778, + "time_per_iteration": 4.987689733505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118552, + "balance_loss_mlp": 1.10211313, + "epoch": 0.5346287033474413, + "flos": 527618644992.0, + "grad_norm": 0.06371644955079436, + "language_loss": 0.79125863, + "learning_rate": 0.00046824407250656676, + "loss": 0.80244416, + "num_input_tokens_seen": 231883840, + "router_z_loss_mlp": 0.16442871, + "step": 2779, + "time_per_iteration": 2.6410112380981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112995, + "balance_loss_mlp": 1.09662735, + "epoch": 0.5348210850327049, + "flos": 510762765312.0, + "grad_norm": 0.060742687445953125, + "language_loss": 0.83655095, + "learning_rate": 0.0004679331653588161, + "loss": 0.84768081, + "num_input_tokens_seen": 231955360, + "router_z_loss_mlp": 0.16369629, + "step": 2780, + "time_per_iteration": 2.625710964202881 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01112315, + "balance_loss_mlp": 1.09542346, + "epoch": 0.5350134667179685, + "flos": 462668184576.0, + "grad_norm": 0.07272998333963254, + "language_loss": 0.85177255, + "learning_rate": 0.0004676222706605147, + "loss": 0.86289573, + "num_input_tokens_seen": 232027088, + "router_z_loss_mlp": 0.16906738, + "step": 2781, + "time_per_iteration": 2.673433542251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110827, + "balance_loss_mlp": 1.09407806, + "epoch": 0.535205848403232, + "flos": 708875712000.0, + "grad_norm": 0.07193058078875894, + "language_loss": 0.85307002, + "learning_rate": 0.0004673113885323626, + "loss": 0.8641783, + "num_input_tokens_seen": 232099472, + "router_z_loss_mlp": 0.16748047, + "step": 2782, + "time_per_iteration": 2.8941664695739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106993, + "balance_loss_mlp": 1.09025598, + "epoch": 0.5353982300884956, + "flos": 894241575936.0, + "grad_norm": 0.10372367104553785, + "language_loss": 0.78561115, + "learning_rate": 0.00046700051909505494, + "loss": 0.79668105, + "num_input_tokens_seen": 232182528, + "router_z_loss_mlp": 0.16748047, + "step": 2783, + "time_per_iteration": 3.2081563472747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111085, + "balance_loss_mlp": 1.09330261, + "epoch": 0.5355906117737591, + "flos": 535965092352.0, + "grad_norm": 0.06865237294530599, + "language_loss": 0.83605123, + "learning_rate": 0.000466689662469282, + "loss": 0.84715974, + "num_input_tokens_seen": 232253344, + "router_z_loss_mlp": 0.17553711, + "step": 2784, + "time_per_iteration": 2.6711413860321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104532, + "balance_loss_mlp": 1.08773518, + "epoch": 0.5357829934590227, + "flos": 868846528512.0, + "grad_norm": 0.08186219318834767, + "language_loss": 0.83921355, + "learning_rate": 0.00046637881877572917, + "loss": 0.85025889, + "num_input_tokens_seen": 
232337232, + "router_z_loss_mlp": 0.16809082, + "step": 2785, + "time_per_iteration": 3.1084179878234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094145, + "balance_loss_mlp": 1.07644248, + "epoch": 0.5359753751442863, + "flos": 553287905280.0, + "grad_norm": 0.07421115565240126, + "language_loss": 0.84573698, + "learning_rate": 0.0004660679881350764, + "loss": 0.85667843, + "num_input_tokens_seen": 232412864, + "router_z_loss_mlp": 0.17736816, + "step": 2786, + "time_per_iteration": 2.7627315521240234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_mlp": 1.02681208, + "epoch": 0.5361677568295499, + "flos": 1480499347968.0, + "grad_norm": 0.02311153951998418, + "language_loss": 0.75608146, + "learning_rate": 0.0004657571706679988, + "loss": 0.76644635, + "num_input_tokens_seen": 232639888, + "router_z_loss_mlp": 0.09667969, + "step": 2787, + "time_per_iteration": 5.0513763427734375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086082, + "balance_loss_mlp": 1.06855869, + "epoch": 0.5363601385148133, + "flos": 806255700480.0, + "grad_norm": 0.07609779475010685, + "language_loss": 0.77801538, + "learning_rate": 0.0004654463664951667, + "loss": 0.78887624, + "num_input_tokens_seen": 232719248, + "router_z_loss_mlp": 0.17541504, + "step": 2788, + "time_per_iteration": 3.050717353820801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085404, + "balance_loss_mlp": 1.06829762, + "epoch": 0.5365525202000769, + "flos": 507879300096.0, + "grad_norm": 0.06896319927596091, + "language_loss": 0.82818955, + "learning_rate": 0.0004651355757372447, + "loss": 0.83904356, + "num_input_tokens_seen": 232788464, + "router_z_loss_mlp": 0.17126465, + "step": 2789, + "time_per_iteration": 2.621809244155884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108222, + "balance_loss_mlp": 1.064816, + "epoch": 0.5367449018853405, + "flos": 528930611712.0, + "grad_norm": 
0.06368186458097214, + "language_loss": 0.85671151, + "learning_rate": 0.00046482479851489274, + "loss": 0.86753374, + "num_input_tokens_seen": 232859792, + "router_z_loss_mlp": 0.17431641, + "step": 2790, + "time_per_iteration": 2.6873245239257812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107657, + "balance_loss_mlp": 1.05957103, + "epoch": 0.5369372835706041, + "flos": 649934088192.0, + "grad_norm": 0.09368235748008798, + "language_loss": 0.77583152, + "learning_rate": 0.00046451403494876525, + "loss": 0.78659725, + "num_input_tokens_seen": 232941472, + "router_z_loss_mlp": 0.17016602, + "step": 2791, + "time_per_iteration": 2.9352025985717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073051, + "balance_loss_mlp": 1.05570602, + "epoch": 0.5371296652558677, + "flos": 584489530368.0, + "grad_norm": 0.09106511666805264, + "language_loss": 0.84479213, + "learning_rate": 0.0004642032851595111, + "loss": 0.85552263, + "num_input_tokens_seen": 233017120, + "router_z_loss_mlp": 0.17358398, + "step": 2792, + "time_per_iteration": 2.8460757732391357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107606, + "balance_loss_mlp": 1.05853653, + "epoch": 0.5373220469411312, + "flos": 595872516096.0, + "grad_norm": 0.09557816920928826, + "language_loss": 0.84886861, + "learning_rate": 0.00046389254926777404, + "loss": 0.85962915, + "num_input_tokens_seen": 233095408, + "router_z_loss_mlp": 0.17541504, + "step": 2793, + "time_per_iteration": 2.8258917331695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071465, + "balance_loss_mlp": 1.05381024, + "epoch": 0.5375144286263948, + "flos": 1114426335744.0, + "grad_norm": 0.10419489870866282, + "language_loss": 0.78006279, + "learning_rate": 0.0004635818273941926, + "loss": 0.79077744, + "num_input_tokens_seen": 233191056, + "router_z_loss_mlp": 0.17675781, + "step": 2794, + "time_per_iteration": 3.5380136966705322 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01073554, + "balance_loss_mlp": 1.05581546, + "epoch": 0.5377068103116583, + "flos": 595608215040.0, + "grad_norm": 0.09943669711596623, + "language_loss": 0.81746304, + "learning_rate": 0.0004632711196593997, + "loss": 0.82819855, + "num_input_tokens_seen": 233265536, + "router_z_loss_mlp": 0.1776123, + "step": 2795, + "time_per_iteration": 2.780565023422241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076704, + "balance_loss_mlp": 1.05881083, + "epoch": 0.5378991919969219, + "flos": 884200292352.0, + "grad_norm": 0.08810005094672828, + "language_loss": 0.85034251, + "learning_rate": 0.00046296042618402297, + "loss": 0.86110961, + "num_input_tokens_seen": 233348224, + "router_z_loss_mlp": 0.17907715, + "step": 2796, + "time_per_iteration": 3.099726915359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076408, + "balance_loss_mlp": 1.0591228, + "epoch": 0.5380915736821854, + "flos": 710664523776.0, + "grad_norm": 0.06043623665913195, + "language_loss": 0.79098737, + "learning_rate": 0.0004626497470886839, + "loss": 0.80175149, + "num_input_tokens_seen": 233429344, + "router_z_loss_mlp": 0.17297363, + "step": 2797, + "time_per_iteration": 2.975820541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082894, + "balance_loss_mlp": 1.06584692, + "epoch": 0.538283955367449, + "flos": 556999151616.0, + "grad_norm": 0.06634785168506467, + "language_loss": 0.81794053, + "learning_rate": 0.00046233908249399897, + "loss": 0.82876945, + "num_input_tokens_seen": 233504944, + "router_z_loss_mlp": 0.1706543, + "step": 2798, + "time_per_iteration": 2.7805473804473877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086325, + "balance_loss_mlp": 1.06942129, + "epoch": 0.5384763370527126, + "flos": 513470762496.0, + "grad_norm": 0.07252012949911142, + "language_loss": 0.78733051, + "learning_rate": 0.00046202843252057905, + "loss": 0.79819375, + 
"num_input_tokens_seen": 233573072, + "router_z_loss_mlp": 0.16906738, + "step": 2799, + "time_per_iteration": 2.666600227355957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091679, + "balance_loss_mlp": 1.07437015, + "epoch": 0.5386687187379762, + "flos": 489736046592.0, + "grad_norm": 0.07864108960704319, + "language_loss": 0.83561981, + "learning_rate": 0.00046171779728902896, + "loss": 0.84653658, + "num_input_tokens_seen": 233640896, + "router_z_loss_mlp": 0.17333984, + "step": 2800, + "time_per_iteration": 2.6010262966156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094958, + "balance_loss_mlp": 1.07766032, + "epoch": 0.5388611004232398, + "flos": 482657149440.0, + "grad_norm": 0.11618067186279732, + "language_loss": 0.85997868, + "learning_rate": 0.000461407176919948, + "loss": 0.87092829, + "num_input_tokens_seen": 233703904, + "router_z_loss_mlp": 0.17321777, + "step": 2801, + "time_per_iteration": 2.5429272651672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094632, + "balance_loss_mlp": 1.07774007, + "epoch": 0.5390534821085032, + "flos": 560984610816.0, + "grad_norm": 0.08430832790687283, + "language_loss": 0.84795403, + "learning_rate": 0.00046109657153392997, + "loss": 0.85890037, + "num_input_tokens_seen": 233779248, + "router_z_loss_mlp": 0.16906738, + "step": 2802, + "time_per_iteration": 2.6846201419830322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108893, + "balance_loss_mlp": 1.07168102, + "epoch": 0.5392458637937668, + "flos": 488377092096.0, + "grad_norm": 0.08650976784842915, + "language_loss": 0.82548422, + "learning_rate": 0.0004607859812515622, + "loss": 0.83637351, + "num_input_tokens_seen": 233847520, + "router_z_loss_mlp": 0.17272949, + "step": 2803, + "time_per_iteration": 2.5817925930023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107998, + "balance_loss_mlp": 1.06338573, + "epoch": 0.5394382454790304, + "flos": 
512057479680.0, + "grad_norm": 0.07563802138366026, + "language_loss": 0.87865353, + "learning_rate": 0.00046047540619342667, + "loss": 0.88945341, + "num_input_tokens_seen": 233911328, + "router_z_loss_mlp": 0.16601562, + "step": 2804, + "time_per_iteration": 2.6165053844451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083951, + "balance_loss_mlp": 1.06755948, + "epoch": 0.539630627164294, + "flos": 567586662912.0, + "grad_norm": 0.07064105870143675, + "language_loss": 0.79886174, + "learning_rate": 0.00046016484648009933, + "loss": 0.8097012, + "num_input_tokens_seen": 233987104, + "router_z_loss_mlp": 0.16394043, + "step": 2805, + "time_per_iteration": 2.725764274597168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084411, + "balance_loss_mlp": 1.06835365, + "epoch": 0.5398230088495575, + "flos": 526462322688.0, + "grad_norm": 0.07630556738551086, + "language_loss": 0.80977762, + "learning_rate": 0.0004598543022321501, + "loss": 0.82062167, + "num_input_tokens_seen": 234057216, + "router_z_loss_mlp": 0.16052246, + "step": 2806, + "time_per_iteration": 2.6351540088653564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085497, + "balance_loss_mlp": 1.06909394, + "epoch": 0.5400153905348211, + "flos": 538764493824.0, + "grad_norm": 0.0649087683342786, + "language_loss": 0.79606426, + "learning_rate": 0.0004595437735701433, + "loss": 0.80691922, + "num_input_tokens_seen": 234129984, + "router_z_loss_mlp": 0.1640625, + "step": 2807, + "time_per_iteration": 2.706876516342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085225, + "balance_loss_mlp": 1.06884575, + "epoch": 0.5402077722200846, + "flos": 513539771904.0, + "grad_norm": 0.08230029830948764, + "language_loss": 0.83224154, + "learning_rate": 0.00045923326061463623, + "loss": 0.84309381, + "num_input_tokens_seen": 234203920, + "router_z_loss_mlp": 0.16381836, + "step": 2808, + "time_per_iteration": 2.7869887351989746 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091151, + "balance_loss_mlp": 1.07481909, + "epoch": 0.5404001539053482, + "flos": 676258232832.0, + "grad_norm": 0.06556687541720137, + "language_loss": 0.81677991, + "learning_rate": 0.00045892276348618113, + "loss": 0.82769144, + "num_input_tokens_seen": 234285440, + "router_z_loss_mlp": 0.16333008, + "step": 2809, + "time_per_iteration": 3.031975269317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_mlp": 1.03327227, + "epoch": 0.5405925355906118, + "flos": 1554834009600.0, + "grad_norm": 0.026553937309941048, + "language_loss": 0.78260827, + "learning_rate": 0.0004586122823053235, + "loss": 0.79302251, + "num_input_tokens_seen": 234521424, + "router_z_loss_mlp": 0.08154297, + "step": 2810, + "time_per_iteration": 5.018324613571167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097064, + "balance_loss_mlp": 1.08080387, + "epoch": 0.5407849172758753, + "flos": 647310154752.0, + "grad_norm": 0.07012301152495938, + "language_loss": 0.80724698, + "learning_rate": 0.000458301817192603, + "loss": 0.81821764, + "num_input_tokens_seen": 234601632, + "router_z_loss_mlp": 0.16259766, + "step": 2811, + "time_per_iteration": 2.8826699256896973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_mlp": 1.02369976, + "epoch": 0.5409772989611389, + "flos": 1407407643648.0, + "grad_norm": 0.020407688998465158, + "language_loss": 0.8084178, + "learning_rate": 0.00045799136826855263, + "loss": 0.81873488, + "num_input_tokens_seen": 234825776, + "router_z_loss_mlp": 0.08007812, + "step": 2812, + "time_per_iteration": 4.821629762649536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094803, + "balance_loss_mlp": 1.07879376, + "epoch": 0.5411696806464025, + "flos": 554389899264.0, + "grad_norm": 0.09349970811932752, + "language_loss": 0.87107521, + "learning_rate": 0.00045768093565369983, + "loss": 
0.88202327, + "num_input_tokens_seen": 234901504, + "router_z_loss_mlp": 0.16003418, + "step": 2813, + "time_per_iteration": 2.798082113265991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096657, + "balance_loss_mlp": 1.08068299, + "epoch": 0.5413620623316661, + "flos": 528122654208.0, + "grad_norm": 0.08975534837118274, + "language_loss": 0.8179177, + "learning_rate": 0.0004573705194685646, + "loss": 0.82888424, + "num_input_tokens_seen": 234970288, + "router_z_loss_mlp": 0.15966797, + "step": 2814, + "time_per_iteration": 2.7093122005462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_mlp": 1.07979465, + "epoch": 0.5415544440169295, + "flos": 598741300224.0, + "grad_norm": 0.07912714625539458, + "language_loss": 0.85284495, + "learning_rate": 0.00045706011983366157, + "loss": 0.86380327, + "num_input_tokens_seen": 235039984, + "router_z_loss_mlp": 0.16027832, + "step": 2815, + "time_per_iteration": 2.736809253692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098087, + "balance_loss_mlp": 1.08264983, + "epoch": 0.5417468257021931, + "flos": 470757671424.0, + "grad_norm": 0.08398974332430421, + "language_loss": 0.82530612, + "learning_rate": 0.00045674973686949847, + "loss": 0.83628702, + "num_input_tokens_seen": 235105232, + "router_z_loss_mlp": 0.1541748, + "step": 2816, + "time_per_iteration": 2.531439781188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105366, + "balance_loss_mlp": 1.08896279, + "epoch": 0.5419392073874567, + "flos": 680819281920.0, + "grad_norm": 0.06449066246678943, + "language_loss": 0.85269451, + "learning_rate": 0.0004564393706965766, + "loss": 0.86374819, + "num_input_tokens_seen": 235192560, + "router_z_loss_mlp": 0.1640625, + "step": 2817, + "time_per_iteration": 3.0000851154327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112663, + "balance_loss_mlp": 1.0963788, + "epoch": 0.5421315890727203, + 
"flos": 462374148096.0, + "grad_norm": 0.0725055130640743, + "language_loss": 0.81484962, + "learning_rate": 0.00045612902143539116, + "loss": 0.82597625, + "num_input_tokens_seen": 235258448, + "router_z_loss_mlp": 0.1628418, + "step": 2818, + "time_per_iteration": 2.5587399005889893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117291, + "balance_loss_mlp": 1.10132849, + "epoch": 0.5423239707579839, + "flos": 436959277056.0, + "grad_norm": 0.0784970788328837, + "language_loss": 0.81825465, + "learning_rate": 0.00045581868920642986, + "loss": 0.82942754, + "num_input_tokens_seen": 235322176, + "router_z_loss_mlp": 0.1595459, + "step": 2819, + "time_per_iteration": 2.4901785850524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126012, + "balance_loss_mlp": 1.11031175, + "epoch": 0.5425163524432474, + "flos": 458314536960.0, + "grad_norm": 0.09999971886905719, + "language_loss": 0.79204059, + "learning_rate": 0.00045550837413017457, + "loss": 0.80330074, + "num_input_tokens_seen": 235390960, + "router_z_loss_mlp": 0.15686035, + "step": 2820, + "time_per_iteration": 2.616154909133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113009, + "balance_loss_mlp": 1.11416399, + "epoch": 0.542708734128511, + "flos": 419495500800.0, + "grad_norm": 0.06819679789144961, + "language_loss": 0.85130954, + "learning_rate": 0.0004551980763271005, + "loss": 0.86261046, + "num_input_tokens_seen": 235460976, + "router_z_loss_mlp": 0.15917969, + "step": 2821, + "time_per_iteration": 2.655139923095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125407, + "balance_loss_mlp": 1.10927796, + "epoch": 0.5429011158137745, + "flos": 678454880256.0, + "grad_norm": 0.0864844698510893, + "language_loss": 0.83889675, + "learning_rate": 0.0004548877959176756, + "loss": 0.85015082, + "num_input_tokens_seen": 235540912, + "router_z_loss_mlp": 0.16125488, + "step": 2822, + "time_per_iteration": 2.8853647708892822 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118965, + "balance_loss_mlp": 1.10281217, + "epoch": 0.5430934974990381, + "flos": 540924065280.0, + "grad_norm": 0.08050409404863457, + "language_loss": 0.8577252, + "learning_rate": 0.00045457753302236166, + "loss": 0.86891484, + "num_input_tokens_seen": 235608736, + "router_z_loss_mlp": 0.16149902, + "step": 2823, + "time_per_iteration": 2.6340198516845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098758, + "balance_loss_mlp": 1.08265328, + "epoch": 0.5432858791843016, + "flos": 658468486656.0, + "grad_norm": 0.09623202069762404, + "language_loss": 0.86938739, + "learning_rate": 0.00045426728776161353, + "loss": 0.88037497, + "num_input_tokens_seen": 235678720, + "router_z_loss_mlp": 0.16101074, + "step": 2824, + "time_per_iteration": 2.792646646499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093887, + "balance_loss_mlp": 1.07741261, + "epoch": 0.5434782608695652, + "flos": 531935216640.0, + "grad_norm": 0.09943652396187513, + "language_loss": 0.81526875, + "learning_rate": 0.00045395706025587863, + "loss": 0.82620764, + "num_input_tokens_seen": 235748704, + "router_z_loss_mlp": 0.16479492, + "step": 2825, + "time_per_iteration": 2.6433639526367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086855, + "balance_loss_mlp": 1.07033277, + "epoch": 0.5436706425548288, + "flos": 608501030400.0, + "grad_norm": 0.0973793187026711, + "language_loss": 0.82506776, + "learning_rate": 0.00045364685062559843, + "loss": 0.83593631, + "num_input_tokens_seen": 235828224, + "router_z_loss_mlp": 0.1652832, + "step": 2826, + "time_per_iteration": 2.8686280250549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082796, + "balance_loss_mlp": 1.06635737, + "epoch": 0.5438630242400924, + "flos": 705418854912.0, + "grad_norm": 0.08127433233154835, + "language_loss": 0.91488934, + "learning_rate": 0.0004533366589912067, + "loss": 
0.92571723, + "num_input_tokens_seen": 235909392, + "router_z_loss_mlp": 0.16442871, + "step": 2827, + "time_per_iteration": 2.9782917499542236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080524, + "balance_loss_mlp": 1.06361961, + "epoch": 0.544055405925356, + "flos": 856425788928.0, + "grad_norm": 0.0854569540023736, + "language_loss": 0.77591085, + "learning_rate": 0.0004530264854731306, + "loss": 0.7867161, + "num_input_tokens_seen": 235983888, + "router_z_loss_mlp": 0.16918945, + "step": 2828, + "time_per_iteration": 3.036414623260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088214, + "balance_loss_mlp": 1.07106018, + "epoch": 0.5442477876106194, + "flos": 571779523584.0, + "grad_norm": 0.06060788976216288, + "language_loss": 0.83699155, + "learning_rate": 0.00045271633019179034, + "loss": 0.84787375, + "num_input_tokens_seen": 236063056, + "router_z_loss_mlp": 0.17163086, + "step": 2829, + "time_per_iteration": 2.7964255809783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085625, + "balance_loss_mlp": 1.06869721, + "epoch": 0.544440169295883, + "flos": 625556971008.0, + "grad_norm": 0.07110421348748326, + "language_loss": 0.87746441, + "learning_rate": 0.0004524061932675986, + "loss": 0.88832062, + "num_input_tokens_seen": 236141104, + "router_z_loss_mlp": 0.16943359, + "step": 2830, + "time_per_iteration": 2.8379290103912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108832, + "balance_loss_mlp": 1.07154715, + "epoch": 0.5446325509811466, + "flos": 836244103680.0, + "grad_norm": 0.09242408982484117, + "language_loss": 0.86632991, + "learning_rate": 0.00045209607482096125, + "loss": 0.87721312, + "num_input_tokens_seen": 236220320, + "router_z_loss_mlp": 0.16784668, + "step": 2831, + "time_per_iteration": 3.018829345703125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082759, + "balance_loss_mlp": 1.06516385, + "epoch": 0.5448249326664102, + 
"flos": 483381043200.0, + "grad_norm": 0.07061707018893328, + "language_loss": 0.84004849, + "learning_rate": 0.0004517859749722772, + "loss": 0.85087609, + "num_input_tokens_seen": 236288208, + "router_z_loss_mlp": 0.17614746, + "step": 2832, + "time_per_iteration": 2.6852874755859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080603, + "balance_loss_mlp": 1.06297243, + "epoch": 0.5450173143516738, + "flos": 561107948544.0, + "grad_norm": 0.0761986265844091, + "language_loss": 0.79247868, + "learning_rate": 0.0004514758938419376, + "loss": 0.8032847, + "num_input_tokens_seen": 236366864, + "router_z_loss_mlp": 0.17663574, + "step": 2833, + "time_per_iteration": 2.8408279418945312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01041827, + "balance_loss_mlp": 1.03262424, + "epoch": 0.5452096960369373, + "flos": 1470420988416.0, + "grad_norm": 0.03242070177943237, + "language_loss": 0.76920587, + "learning_rate": 0.0004511658315503268, + "loss": 0.77962416, + "num_input_tokens_seen": 236597120, + "router_z_loss_mlp": 0.09179688, + "step": 2834, + "time_per_iteration": 4.971372842788696 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079743, + "balance_loss_mlp": 1.06190884, + "epoch": 0.5454020777222008, + "flos": 465064892928.0, + "grad_norm": 0.12322372516304661, + "language_loss": 0.83831322, + "learning_rate": 0.00045085578821782175, + "loss": 0.84911072, + "num_input_tokens_seen": 236664192, + "router_z_loss_mlp": 0.1784668, + "step": 2835, + "time_per_iteration": 2.568789482116699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021066, + "balance_loss_mlp": 1.01186323, + "epoch": 0.5455944594074644, + "flos": 1469657820672.0, + "grad_norm": 0.019977782676812977, + "language_loss": 0.76134741, + "learning_rate": 0.0004505457639647917, + "loss": 0.77155805, + "num_input_tokens_seen": 236888784, + "router_z_loss_mlp": 0.09179688, + "step": 2836, + "time_per_iteration": 
4.917972803115845 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_mlp": 1.05981982, + "epoch": 0.545786841092728, + "flos": 533180371968.0, + "grad_norm": 0.07873848801353439, + "language_loss": 0.809609, + "learning_rate": 0.00045023575891159866, + "loss": 0.82039082, + "num_input_tokens_seen": 236962528, + "router_z_loss_mlp": 0.18347168, + "step": 2837, + "time_per_iteration": 2.723172187805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005469, + "balance_loss_mlp": 0.99645638, + "epoch": 0.5459792227779915, + "flos": 1352389810176.0, + "grad_norm": 0.008784480510471485, + "language_loss": 0.74763811, + "learning_rate": 0.00044992577317859764, + "loss": 0.75769281, + "num_input_tokens_seen": 237179360, + "router_z_loss_mlp": 0.09033203, + "step": 2838, + "time_per_iteration": 4.9626312255859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108476, + "balance_loss_mlp": 1.06662869, + "epoch": 0.5461716044632551, + "flos": 637881537024.0, + "grad_norm": 0.06459027340027895, + "language_loss": 0.77977401, + "learning_rate": 0.0004496158068861354, + "loss": 0.79062164, + "num_input_tokens_seen": 237256240, + "router_z_loss_mlp": 0.18139648, + "step": 2839, + "time_per_iteration": 2.8617422580718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089506, + "balance_loss_mlp": 1.0716958, + "epoch": 0.5463639861485187, + "flos": 602751352320.0, + "grad_norm": 0.06807598587278012, + "language_loss": 0.8025732, + "learning_rate": 0.00044930586015455207, + "loss": 0.81346834, + "num_input_tokens_seen": 237334272, + "router_z_loss_mlp": 0.17810059, + "step": 2840, + "time_per_iteration": 2.808669328689575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083083, + "balance_loss_mlp": 1.06519008, + "epoch": 0.5465563678337823, + "flos": 642516738048.0, + "grad_norm": 0.07651604144285383, + "language_loss": 0.88620353, + "learning_rate": 
0.000448995933104179, + "loss": 0.89703441, + "num_input_tokens_seen": 237415408, + "router_z_loss_mlp": 0.17907715, + "step": 2841, + "time_per_iteration": 2.877012252807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091203, + "balance_loss_mlp": 1.07347631, + "epoch": 0.5467487495190458, + "flos": 614154161664.0, + "grad_norm": 0.06436857909350054, + "language_loss": 0.79967082, + "learning_rate": 0.00044868602585534077, + "loss": 0.81058288, + "num_input_tokens_seen": 237493232, + "router_z_loss_mlp": 0.17749023, + "step": 2842, + "time_per_iteration": 2.8602800369262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086176, + "balance_loss_mlp": 1.06872416, + "epoch": 0.5469411312043093, + "flos": 461190661632.0, + "grad_norm": 0.07724706520419639, + "language_loss": 0.88682342, + "learning_rate": 0.0004483761385283541, + "loss": 0.89768517, + "num_input_tokens_seen": 237556624, + "router_z_loss_mlp": 0.17468262, + "step": 2843, + "time_per_iteration": 2.613612413406372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083541, + "balance_loss_mlp": 1.06613624, + "epoch": 0.5471335128895729, + "flos": 561197154816.0, + "grad_norm": 0.07006219963607276, + "language_loss": 0.81547797, + "learning_rate": 0.0004480662712435281, + "loss": 0.82631338, + "num_input_tokens_seen": 237632048, + "router_z_loss_mlp": 0.17419434, + "step": 2844, + "time_per_iteration": 2.754683256149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084106, + "balance_loss_mlp": 1.0670594, + "epoch": 0.5473258945748365, + "flos": 518686695936.0, + "grad_norm": 0.0733295738661856, + "language_loss": 0.88330519, + "learning_rate": 0.0004477564241211635, + "loss": 0.89414632, + "num_input_tokens_seen": 237699840, + "router_z_loss_mlp": 0.1706543, + "step": 2845, + "time_per_iteration": 2.6289172172546387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_mlp": 1.06219196, + 
"epoch": 0.5475182762601001, + "flos": 433828763136.0, + "grad_norm": 0.07864053458548881, + "language_loss": 0.8673318, + "learning_rate": 0.0004474465972815541, + "loss": 0.87812233, + "num_input_tokens_seen": 237762560, + "router_z_loss_mlp": 0.16870117, + "step": 2846, + "time_per_iteration": 2.560227870941162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082496, + "balance_loss_mlp": 1.06498456, + "epoch": 0.5477106579453636, + "flos": 511560811008.0, + "grad_norm": 0.07175771823025028, + "language_loss": 0.87547499, + "learning_rate": 0.000447136790844985, + "loss": 0.88629997, + "num_input_tokens_seen": 237837152, + "router_z_loss_mlp": 0.17529297, + "step": 2847, + "time_per_iteration": 2.677354574203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084037, + "balance_loss_mlp": 1.0662632, + "epoch": 0.5479030396306271, + "flos": 675912439296.0, + "grad_norm": 0.07349860951266184, + "language_loss": 0.80877674, + "learning_rate": 0.00044682700493173385, + "loss": 0.81961715, + "num_input_tokens_seen": 237909488, + "router_z_loss_mlp": 0.17785645, + "step": 2848, + "time_per_iteration": 2.8295233249664307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085345, + "balance_loss_mlp": 1.06835747, + "epoch": 0.5480954213158907, + "flos": 876090981888.0, + "grad_norm": 0.14023883156705388, + "language_loss": 0.80396128, + "learning_rate": 0.00044651723966207004, + "loss": 0.81481469, + "num_input_tokens_seen": 237991056, + "router_z_loss_mlp": 0.17004395, + "step": 2849, + "time_per_iteration": 3.1462562084198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108891, + "balance_loss_mlp": 1.07174444, + "epoch": 0.5482878030011543, + "flos": 622006511616.0, + "grad_norm": 0.07606363506125788, + "language_loss": 0.78336805, + "learning_rate": 0.00044620749515625536, + "loss": 0.79425722, + "num_input_tokens_seen": 238064576, + "router_z_loss_mlp": 0.17163086, + "step": 2850, + 
"time_per_iteration": 2.7834317684173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010911, + "balance_loss_mlp": 1.07376719, + "epoch": 0.5484801846864179, + "flos": 497207725056.0, + "grad_norm": 0.06852456667367239, + "language_loss": 0.84954178, + "learning_rate": 0.00044589777153454334, + "loss": 0.86045277, + "num_input_tokens_seen": 238136464, + "router_z_loss_mlp": 0.17346191, + "step": 2851, + "time_per_iteration": 2.760814666748047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093157, + "balance_loss_mlp": 1.076015, + "epoch": 0.5486725663716814, + "flos": 442432171008.0, + "grad_norm": 0.07096393350950583, + "language_loss": 0.83673847, + "learning_rate": 0.00044558806891717895, + "loss": 0.84767002, + "num_input_tokens_seen": 238198912, + "router_z_loss_mlp": 0.17163086, + "step": 2852, + "time_per_iteration": 2.5164217948913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100792, + "balance_loss_mlp": 1.08369744, + "epoch": 0.548864948056945, + "flos": 655162504704.0, + "grad_norm": 0.07126320694951607, + "language_loss": 0.79487526, + "learning_rate": 0.0004452783874243998, + "loss": 0.80588323, + "num_input_tokens_seen": 238275184, + "router_z_loss_mlp": 0.17102051, + "step": 2853, + "time_per_iteration": 2.8530960083007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103909, + "balance_loss_mlp": 1.08725584, + "epoch": 0.5490573297422086, + "flos": 546036111360.0, + "grad_norm": 0.08398495342430926, + "language_loss": 0.84832799, + "learning_rate": 0.00044496872717643475, + "loss": 0.85936707, + "num_input_tokens_seen": 238348496, + "router_z_loss_mlp": 0.16662598, + "step": 2854, + "time_per_iteration": 2.7308356761932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148414, + "balance_loss_mlp": 1.13902032, + "epoch": 0.5492497114274721, + "flos": 1590309987840.0, + "grad_norm": 0.045162076754917825, + "language_loss": 0.77089292, + 
"learning_rate": 0.00044465908829350453, + "loss": 0.78237706, + "num_input_tokens_seen": 238578464, + "router_z_loss_mlp": 0.09375, + "step": 2855, + "time_per_iteration": 4.96257209777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110869, + "balance_loss_mlp": 1.0924654, + "epoch": 0.5494420931127356, + "flos": 750906754560.0, + "grad_norm": 0.08468526373475738, + "language_loss": 0.81551182, + "learning_rate": 0.0004443494708958217, + "loss": 0.8265987, + "num_input_tokens_seen": 238660256, + "router_z_loss_mlp": 0.16223145, + "step": 2856, + "time_per_iteration": 3.0704264640808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101313, + "balance_loss_mlp": 1.08494544, + "epoch": 0.5496344747979992, + "flos": 626023904256.0, + "grad_norm": 0.07044544020385766, + "language_loss": 0.8094157, + "learning_rate": 0.0004440398751035906, + "loss": 0.82042885, + "num_input_tokens_seen": 238745856, + "router_z_loss_mlp": 0.16369629, + "step": 2857, + "time_per_iteration": 2.971601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089993, + "balance_loss_mlp": 1.07342279, + "epoch": 0.5498268564832628, + "flos": 523111924224.0, + "grad_norm": 0.09537197244188163, + "language_loss": 0.83738565, + "learning_rate": 0.00044373030103700645, + "loss": 0.84828568, + "num_input_tokens_seen": 238813888, + "router_z_loss_mlp": 0.16577148, + "step": 2858, + "time_per_iteration": 2.6193714141845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082176, + "balance_loss_mlp": 1.06564164, + "epoch": 0.5500192381685264, + "flos": 604587151872.0, + "grad_norm": 0.080765091719421, + "language_loss": 0.79399335, + "learning_rate": 0.000443420748816257, + "loss": 0.80481505, + "num_input_tokens_seen": 238885440, + "router_z_loss_mlp": 0.16540527, + "step": 2859, + "time_per_iteration": 2.8064911365509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080106, + "balance_loss_mlp": 
1.06258249, + "epoch": 0.55021161985379, + "flos": 520527264768.0, + "grad_norm": 0.073148777328263, + "language_loss": 0.78411651, + "learning_rate": 0.0004431112185615208, + "loss": 0.79491758, + "num_input_tokens_seen": 238960944, + "router_z_loss_mlp": 0.17541504, + "step": 2860, + "time_per_iteration": 2.8055756092071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075442, + "balance_loss_mlp": 1.05794191, + "epoch": 0.5504040015390534, + "flos": 489671806464.0, + "grad_norm": 0.07383159181316334, + "language_loss": 0.80081785, + "learning_rate": 0.00044280171039296845, + "loss": 0.81157225, + "num_input_tokens_seen": 239030592, + "router_z_loss_mlp": 0.17504883, + "step": 2861, + "time_per_iteration": 2.643036127090454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107612, + "balance_loss_mlp": 1.05894184, + "epoch": 0.550596383224317, + "flos": 575787377664.0, + "grad_norm": 0.07661018407476591, + "language_loss": 0.88472402, + "learning_rate": 0.0004424922244307616, + "loss": 0.89548522, + "num_input_tokens_seen": 239097440, + "router_z_loss_mlp": 0.171875, + "step": 2862, + "time_per_iteration": 2.735457181930542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071011, + "balance_loss_mlp": 1.05303383, + "epoch": 0.5507887649095806, + "flos": 642445157376.0, + "grad_norm": 0.07542764443639904, + "language_loss": 0.82038581, + "learning_rate": 0.00044218276079505315, + "loss": 0.83109593, + "num_input_tokens_seen": 239179872, + "router_z_loss_mlp": 0.17980957, + "step": 2863, + "time_per_iteration": 2.8912277221679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074407, + "balance_loss_mlp": 1.05706251, + "epoch": 0.5509811465948442, + "flos": 531843812352.0, + "grad_norm": 0.07733612279333801, + "language_loss": 0.74451876, + "learning_rate": 0.0004418733196059876, + "loss": 0.75526285, + "num_input_tokens_seen": 239251264, + "router_z_loss_mlp": 0.17358398, + "step": 2864, + 
"time_per_iteration": 2.7250518798828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072289, + "balance_loss_mlp": 1.0549556, + "epoch": 0.5511735282801077, + "flos": 654747328512.0, + "grad_norm": 0.07639087544106095, + "language_loss": 0.79757476, + "learning_rate": 0.0004415639009837008, + "loss": 0.80829769, + "num_input_tokens_seen": 239326688, + "router_z_loss_mlp": 0.17358398, + "step": 2865, + "time_per_iteration": 2.864443302154541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080254, + "balance_loss_mlp": 1.06293249, + "epoch": 0.5513659099653713, + "flos": 529498861056.0, + "grad_norm": 0.10225669356006223, + "language_loss": 0.81241995, + "learning_rate": 0.00044125450504831955, + "loss": 0.82322252, + "num_input_tokens_seen": 239401248, + "router_z_loss_mlp": 0.17346191, + "step": 2866, + "time_per_iteration": 2.757418394088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106921, + "balance_loss_mlp": 1.05211556, + "epoch": 0.5515582916506349, + "flos": 554869315584.0, + "grad_norm": 0.07466053084799135, + "language_loss": 0.82329029, + "learning_rate": 0.0004409451319199622, + "loss": 0.83398235, + "num_input_tokens_seen": 239471600, + "router_z_loss_mlp": 0.17102051, + "step": 2867, + "time_per_iteration": 2.6991469860076904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076691, + "balance_loss_mlp": 1.05928612, + "epoch": 0.5517506733358984, + "flos": 735407258112.0, + "grad_norm": 0.07186936074556817, + "language_loss": 0.84288383, + "learning_rate": 0.0004406357817187381, + "loss": 0.85365069, + "num_input_tokens_seen": 239548592, + "router_z_loss_mlp": 0.17419434, + "step": 2868, + "time_per_iteration": 3.0115489959716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080563, + "balance_loss_mlp": 1.06333685, + "epoch": 0.551943055021162, + "flos": 1115325697536.0, + "grad_norm": 0.0781084398751081, + "language_loss": 0.81316972, + 
"learning_rate": 0.0004403264545647474, + "loss": 0.82397532, + "num_input_tokens_seen": 239644432, + "router_z_loss_mlp": 0.17224121, + "step": 2869, + "time_per_iteration": 3.5515377521514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076801, + "balance_loss_mlp": 1.05957544, + "epoch": 0.5521354367064255, + "flos": 544373208576.0, + "grad_norm": 0.2476039521732135, + "language_loss": 0.84535432, + "learning_rate": 0.00044001715057808154, + "loss": 0.85612237, + "num_input_tokens_seen": 239723392, + "router_z_loss_mlp": 0.17236328, + "step": 2870, + "time_per_iteration": 2.784949541091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081188, + "balance_loss_mlp": 1.06391478, + "epoch": 0.5523278183916891, + "flos": 936285101568.0, + "grad_norm": 0.06269874774360217, + "language_loss": 0.81665605, + "learning_rate": 0.0004397078698788232, + "loss": 0.82746798, + "num_input_tokens_seen": 239806896, + "router_z_loss_mlp": 0.17285156, + "step": 2871, + "time_per_iteration": 3.2355031967163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_mlp": 1.02401352, + "epoch": 0.5525202000769527, + "flos": 1465911696384.0, + "grad_norm": 0.01828848292268018, + "language_loss": 0.80442369, + "learning_rate": 0.0004393986125870456, + "loss": 0.81475484, + "num_input_tokens_seen": 240037824, + "router_z_loss_mlp": 0.09130859, + "step": 2872, + "time_per_iteration": 4.935345888137817 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102138, + "balance_loss_mlp": 1.08499527, + "epoch": 0.5527125817622163, + "flos": 489800286720.0, + "grad_norm": 0.07166089349392388, + "language_loss": 0.77967119, + "learning_rate": 0.00043908937882281343, + "loss": 0.79069257, + "num_input_tokens_seen": 240107952, + "router_z_loss_mlp": 0.17150879, + "step": 2873, + "time_per_iteration": 2.6478757858276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109525, + 
"balance_loss_mlp": 1.0917629, + "epoch": 0.5529049634474797, + "flos": 634914008064.0, + "grad_norm": 0.0876696984943119, + "language_loss": 0.8235116, + "learning_rate": 0.0004387801687061814, + "loss": 0.83460689, + "num_input_tokens_seen": 240183824, + "router_z_loss_mlp": 0.17773438, + "step": 2874, + "time_per_iteration": 2.8796098232269287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117288, + "balance_loss_mlp": 1.09996676, + "epoch": 0.5530973451327433, + "flos": 581274952704.0, + "grad_norm": 0.10934470386726207, + "language_loss": 0.80325609, + "learning_rate": 0.0004384709823571958, + "loss": 0.81442899, + "num_input_tokens_seen": 240259296, + "router_z_loss_mlp": 0.17321777, + "step": 2875, + "time_per_iteration": 2.7749507427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116347, + "balance_loss_mlp": 1.09927666, + "epoch": 0.5532897268180069, + "flos": 1122488658432.0, + "grad_norm": 0.09489557943610515, + "language_loss": 0.82828677, + "learning_rate": 0.0004381618198958932, + "loss": 0.83945024, + "num_input_tokens_seen": 240346768, + "router_z_loss_mlp": 0.17089844, + "step": 2876, + "time_per_iteration": 3.550828218460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113394, + "balance_loss_mlp": 1.09662116, + "epoch": 0.5534821085032705, + "flos": 637273640448.0, + "grad_norm": 0.0896519056563172, + "language_loss": 0.83453453, + "learning_rate": 0.00043785268144230137, + "loss": 0.84566844, + "num_input_tokens_seen": 240429344, + "router_z_loss_mlp": 0.16784668, + "step": 2877, + "time_per_iteration": 2.934293270111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100462, + "balance_loss_mlp": 1.08360553, + "epoch": 0.5536744901885341, + "flos": 571112529408.0, + "grad_norm": 0.09194081720705921, + "language_loss": 0.8212803, + "learning_rate": 0.00043754356711643837, + "loss": 0.83228499, + "num_input_tokens_seen": 240497008, + "router_z_loss_mlp": 
0.16870117, + "step": 2878, + "time_per_iteration": 2.7139456272125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100534, + "balance_loss_mlp": 1.08367825, + "epoch": 0.5538668718737976, + "flos": 595716871680.0, + "grad_norm": 0.06610172637947556, + "language_loss": 0.83962673, + "learning_rate": 0.0004372344770383132, + "loss": 0.85063207, + "num_input_tokens_seen": 240578432, + "router_z_loss_mlp": 0.16870117, + "step": 2879, + "time_per_iteration": 2.848620891571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093635, + "balance_loss_mlp": 1.07679105, + "epoch": 0.5540592535590612, + "flos": 532602210816.0, + "grad_norm": 0.058036155609321634, + "language_loss": 0.82615423, + "learning_rate": 0.00043692541132792507, + "loss": 0.83709061, + "num_input_tokens_seen": 240649136, + "router_z_loss_mlp": 0.1685791, + "step": 2880, + "time_per_iteration": 2.713151693344116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091805, + "balance_loss_mlp": 1.07453132, + "epoch": 0.5542516352443247, + "flos": 412619235840.0, + "grad_norm": 0.07516039196528058, + "language_loss": 0.83473843, + "learning_rate": 0.00043661637010526384, + "loss": 0.84565651, + "num_input_tokens_seen": 240714240, + "router_z_loss_mlp": 0.17285156, + "step": 2881, + "time_per_iteration": 2.500458240509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109005, + "balance_loss_mlp": 1.07309878, + "epoch": 0.5544440169295883, + "flos": 547607609856.0, + "grad_norm": 0.06896643795770978, + "language_loss": 0.83134168, + "learning_rate": 0.00043630735349031025, + "loss": 0.84224218, + "num_input_tokens_seen": 240786928, + "router_z_loss_mlp": 0.16967773, + "step": 2882, + "time_per_iteration": 2.7521300315856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089845, + "balance_loss_mlp": 1.07317972, + "epoch": 0.5546363986148518, + "flos": 621821131776.0, + "grad_norm": 0.0736705000466592, + 
"language_loss": 0.81719375, + "learning_rate": 0.00043599836160303495, + "loss": 0.82809222, + "num_input_tokens_seen": 240865328, + "router_z_loss_mlp": 0.16674805, + "step": 2883, + "time_per_iteration": 2.927696704864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092625, + "balance_loss_mlp": 1.07550669, + "epoch": 0.5548287803001154, + "flos": 705292945920.0, + "grad_norm": 0.07830589066561539, + "language_loss": 0.77380168, + "learning_rate": 0.0004356893945633995, + "loss": 0.78472787, + "num_input_tokens_seen": 240945680, + "router_z_loss_mlp": 0.17126465, + "step": 2884, + "time_per_iteration": 2.9854161739349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095707, + "balance_loss_mlp": 1.07886314, + "epoch": 0.555021161985379, + "flos": 504197789184.0, + "grad_norm": 0.06846026312584631, + "language_loss": 0.81705189, + "learning_rate": 0.0004353804524913551, + "loss": 0.82800889, + "num_input_tokens_seen": 241010800, + "router_z_loss_mlp": 0.1685791, + "step": 2885, + "time_per_iteration": 2.6230812072753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109452, + "balance_loss_mlp": 1.07769918, + "epoch": 0.5552135436706426, + "flos": 616066684416.0, + "grad_norm": 0.07648898628472602, + "language_loss": 0.81513786, + "learning_rate": 0.0004350715355068441, + "loss": 0.82608306, + "num_input_tokens_seen": 241085328, + "router_z_loss_mlp": 0.16821289, + "step": 2886, + "time_per_iteration": 2.7672505378723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088661, + "balance_loss_mlp": 1.07191217, + "epoch": 0.5554059253559062, + "flos": 463871494656.0, + "grad_norm": 0.09976401172783889, + "language_loss": 0.79409927, + "learning_rate": 0.00043476264372979847, + "loss": 0.80498588, + "num_input_tokens_seen": 241149600, + "router_z_loss_mlp": 0.16760254, + "step": 2887, + "time_per_iteration": 2.5482900142669678 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0108678, + "balance_loss_mlp": 1.07004309, + "epoch": 0.5555983070411696, + "flos": 1562512384512.0, + "grad_norm": 0.07823105816490118, + "language_loss": 0.78681719, + "learning_rate": 0.0004344537772801408, + "loss": 0.79768503, + "num_input_tokens_seen": 241244832, + "router_z_loss_mlp": 0.16748047, + "step": 2888, + "time_per_iteration": 3.8460328578948975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021438, + "balance_loss_mlp": 1.01290298, + "epoch": 0.5557906887264332, + "flos": 1467917821440.0, + "grad_norm": 0.01755933384686064, + "language_loss": 0.73422456, + "learning_rate": 0.0004341449362777836, + "loss": 0.74443889, + "num_input_tokens_seen": 241479728, + "router_z_loss_mlp": 0.08544922, + "step": 2889, + "time_per_iteration": 4.991191625595093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090362, + "balance_loss_mlp": 1.07311237, + "epoch": 0.5559830704116968, + "flos": 529832544768.0, + "grad_norm": 0.07150457401269486, + "language_loss": 0.83297288, + "learning_rate": 0.0004338361208426298, + "loss": 0.84387648, + "num_input_tokens_seen": 241545616, + "router_z_loss_mlp": 0.17272949, + "step": 2890, + "time_per_iteration": 2.6730411052703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108659, + "balance_loss_mlp": 1.06942344, + "epoch": 0.5561754520969604, + "flos": 651218890752.0, + "grad_norm": 0.07268648775014128, + "language_loss": 0.81282032, + "learning_rate": 0.00043352733109457164, + "loss": 0.82368624, + "num_input_tokens_seen": 241629040, + "router_z_loss_mlp": 0.17175293, + "step": 2891, + "time_per_iteration": 2.9306631088256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094106, + "balance_loss_mlp": 1.07713079, + "epoch": 0.556367833782224, + "flos": 734297923584.0, + "grad_norm": 0.057117910972540105, + "language_loss": 0.8439607, + "learning_rate": 0.00043321856715349244, + "loss": 0.85490179, + "num_input_tokens_seen": 
241706272, + "router_z_loss_mlp": 0.1697998, + "step": 2892, + "time_per_iteration": 2.9671812057495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089069, + "balance_loss_mlp": 1.07197452, + "epoch": 0.5565602154674875, + "flos": 672423648768.0, + "grad_norm": 0.07676329529256688, + "language_loss": 0.80519265, + "learning_rate": 0.00043290982913926466, + "loss": 0.81608331, + "num_input_tokens_seen": 241782304, + "router_z_loss_mlp": 0.17089844, + "step": 2893, + "time_per_iteration": 2.853346347808838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095237, + "balance_loss_mlp": 1.07807112, + "epoch": 0.556752597152751, + "flos": 586228783104.0, + "grad_norm": 0.07854184605893377, + "language_loss": 0.84350514, + "learning_rate": 0.0004326011171717514, + "loss": 0.8544575, + "num_input_tokens_seen": 241868576, + "router_z_loss_mlp": 0.171875, + "step": 2894, + "time_per_iteration": 2.899630546569824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090371, + "balance_loss_mlp": 1.07324028, + "epoch": 0.5569449788380146, + "flos": 437777146368.0, + "grad_norm": 0.0742839839754536, + "language_loss": 0.80647063, + "learning_rate": 0.0004322924313708051, + "loss": 0.81737435, + "num_input_tokens_seen": 241933696, + "router_z_loss_mlp": 0.17138672, + "step": 2895, + "time_per_iteration": 2.51411771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094314, + "balance_loss_mlp": 1.07758927, + "epoch": 0.5571373605232782, + "flos": 502250761728.0, + "grad_norm": 0.09937187753239417, + "language_loss": 0.8452369, + "learning_rate": 0.0004319837718562681, + "loss": 0.85618007, + "num_input_tokens_seen": 242003056, + "router_z_loss_mlp": 0.1673584, + "step": 2896, + "time_per_iteration": 2.655710220336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079176, + "balance_loss_mlp": 1.06149721, + "epoch": 0.5573297422085417, + "flos": 577417973760.0, + "grad_norm": 
0.08562998531843592, + "language_loss": 0.83042324, + "learning_rate": 0.0004316751387479726, + "loss": 0.84121501, + "num_input_tokens_seen": 242076368, + "router_z_loss_mlp": 0.17700195, + "step": 2897, + "time_per_iteration": 2.7913060188293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087861, + "balance_loss_mlp": 1.07069528, + "epoch": 0.5575221238938053, + "flos": 1344037515264.0, + "grad_norm": 0.0783746969742657, + "language_loss": 0.82070696, + "learning_rate": 0.0004313665321657409, + "loss": 0.83158553, + "num_input_tokens_seen": 242161600, + "router_z_loss_mlp": 0.17175293, + "step": 2898, + "time_per_iteration": 3.726264476776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086135, + "balance_loss_mlp": 1.06881404, + "epoch": 0.5577145055790689, + "flos": 601963218432.0, + "grad_norm": 0.0851867501114316, + "language_loss": 0.79751718, + "learning_rate": 0.00043105795222938436, + "loss": 0.80837852, + "num_input_tokens_seen": 242237904, + "router_z_loss_mlp": 0.17346191, + "step": 2899, + "time_per_iteration": 2.7452197074890137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079222, + "balance_loss_mlp": 1.06218684, + "epoch": 0.5579068872643325, + "flos": 562620349440.0, + "grad_norm": 0.07553101492130006, + "language_loss": 0.78055334, + "learning_rate": 0.00043074939905870467, + "loss": 0.7913456, + "num_input_tokens_seen": 242306736, + "router_z_loss_mlp": 0.17053223, + "step": 2900, + "time_per_iteration": 2.6780247688293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107635, + "balance_loss_mlp": 1.05935049, + "epoch": 0.558099268949596, + "flos": 544551247872.0, + "grad_norm": 0.07503151839740589, + "language_loss": 0.80663788, + "learning_rate": 0.0004304408727734927, + "loss": 0.81740135, + "num_input_tokens_seen": 242376000, + "router_z_loss_mlp": 0.17016602, + "step": 2901, + "time_per_iteration": 2.7029857635498047 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01073519, + "balance_loss_mlp": 1.05609071, + "epoch": 0.5582916506348595, + "flos": 552786467328.0, + "grad_norm": 0.07321045917693372, + "language_loss": 0.88611877, + "learning_rate": 0.0004301323734935288, + "loss": 0.89685392, + "num_input_tokens_seen": 242447056, + "router_z_loss_mlp": 0.17443848, + "step": 2902, + "time_per_iteration": 2.679443597793579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107071, + "balance_loss_mlp": 1.05356789, + "epoch": 0.5584840323201231, + "flos": 543385013760.0, + "grad_norm": 0.07694594545228804, + "language_loss": 0.8710258, + "learning_rate": 0.000429823901338583, + "loss": 0.88173282, + "num_input_tokens_seen": 242514400, + "router_z_loss_mlp": 0.17150879, + "step": 2903, + "time_per_iteration": 2.627321720123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069288, + "balance_loss_mlp": 1.05181181, + "epoch": 0.5586764140053867, + "flos": 815573090304.0, + "grad_norm": 0.06625834371738154, + "language_loss": 0.8649714, + "learning_rate": 0.00042951545642841513, + "loss": 0.87566429, + "num_input_tokens_seen": 242601616, + "router_z_loss_mlp": 0.17492676, + "step": 2904, + "time_per_iteration": 3.0950725078582764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079393, + "balance_loss_mlp": 1.06204844, + "epoch": 0.5588687956906503, + "flos": 486439976448.0, + "grad_norm": 0.06552893866180562, + "language_loss": 0.86677754, + "learning_rate": 0.0004292070388827737, + "loss": 0.87757146, + "num_input_tokens_seen": 242669648, + "router_z_loss_mlp": 0.17358398, + "step": 2905, + "time_per_iteration": 2.6045844554901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079778, + "balance_loss_mlp": 1.0621829, + "epoch": 0.5590611773759138, + "flos": 452060849664.0, + "grad_norm": 0.06250610211350227, + "language_loss": 0.81015515, + "learning_rate": 0.00042889864882139753, + "loss": 0.82095295, + "num_input_tokens_seen": 
242737456, + "router_z_loss_mlp": 0.17602539, + "step": 2906, + "time_per_iteration": 2.5961766242980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089486, + "balance_loss_mlp": 1.07233191, + "epoch": 0.5592535590611774, + "flos": 520945012224.0, + "grad_norm": 0.06934465100856418, + "language_loss": 0.81378168, + "learning_rate": 0.0004285902863640139, + "loss": 0.82467651, + "num_input_tokens_seen": 242807008, + "router_z_loss_mlp": 0.17175293, + "step": 2907, + "time_per_iteration": 2.6232824325561523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085922, + "balance_loss_mlp": 1.06869626, + "epoch": 0.5594459407464409, + "flos": 552519595008.0, + "grad_norm": 0.10268967312822828, + "language_loss": 0.86113304, + "learning_rate": 0.00042828195163033966, + "loss": 0.87199223, + "num_input_tokens_seen": 242877328, + "router_z_loss_mlp": 0.17236328, + "step": 2908, + "time_per_iteration": 2.696558952331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099626, + "balance_loss_mlp": 1.08187604, + "epoch": 0.5596383224317045, + "flos": 484833973248.0, + "grad_norm": 0.07292872799420033, + "language_loss": 0.78787363, + "learning_rate": 0.0004279736447400812, + "loss": 0.79886991, + "num_input_tokens_seen": 242943152, + "router_z_loss_mlp": 0.1776123, + "step": 2909, + "time_per_iteration": 2.5506749153137207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097418, + "balance_loss_mlp": 1.08000195, + "epoch": 0.5598307041169681, + "flos": 611256015360.0, + "grad_norm": 0.08183440800263254, + "language_loss": 0.78410208, + "learning_rate": 0.00042766536581293385, + "loss": 0.79507631, + "num_input_tokens_seen": 243014656, + "router_z_loss_mlp": 0.17431641, + "step": 2910, + "time_per_iteration": 2.762291193008423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107558, + "balance_loss_mlp": 1.09001017, + "epoch": 0.5600230858022316, + "flos": 488851365888.0, + 
"grad_norm": 0.07156517368649688, + "language_loss": 0.79594785, + "learning_rate": 0.0004273571149685819, + "loss": 0.80702341, + "num_input_tokens_seen": 243089040, + "router_z_loss_mlp": 0.17541504, + "step": 2911, + "time_per_iteration": 2.8065130710601807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106568, + "balance_loss_mlp": 1.08937764, + "epoch": 0.5602154674874952, + "flos": 598869780480.0, + "grad_norm": 0.09303022295818829, + "language_loss": 0.83760977, + "learning_rate": 0.00042704889232669937, + "loss": 0.84867543, + "num_input_tokens_seen": 243162480, + "router_z_loss_mlp": 0.17199707, + "step": 2912, + "time_per_iteration": 2.7454051971435547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107264, + "balance_loss_mlp": 1.09049106, + "epoch": 0.5604078491727588, + "flos": 585969624576.0, + "grad_norm": 0.08686899917243208, + "language_loss": 0.85566956, + "learning_rate": 0.0004267406980069484, + "loss": 0.86674225, + "num_input_tokens_seen": 243232880, + "router_z_loss_mlp": 0.16772461, + "step": 2913, + "time_per_iteration": 2.703652858734131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100261, + "balance_loss_mlp": 1.08297539, + "epoch": 0.5606002308580224, + "flos": 541205618688.0, + "grad_norm": 0.07169329099349257, + "language_loss": 0.79587048, + "learning_rate": 0.0004264325321289808, + "loss": 0.80687308, + "num_input_tokens_seen": 243309168, + "router_z_loss_mlp": 0.17297363, + "step": 2914, + "time_per_iteration": 2.8367066383361816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100801, + "balance_loss_mlp": 1.08408761, + "epoch": 0.5607926125432858, + "flos": 583938533376.0, + "grad_norm": 0.08752271404037346, + "language_loss": 0.85925829, + "learning_rate": 0.00042612439481243736, + "loss": 0.87026626, + "num_input_tokens_seen": 243382064, + "router_z_loss_mlp": 0.16711426, + "step": 2915, + "time_per_iteration": 2.801067590713501 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102823, + "balance_loss_mlp": 1.08577609, + "epoch": 0.5609849942285494, + "flos": 627489317376.0, + "grad_norm": 0.08075626027224062, + "language_loss": 0.89818108, + "learning_rate": 0.00042581628617694735, + "loss": 0.90920925, + "num_input_tokens_seen": 243452064, + "router_z_loss_mlp": 0.1706543, + "step": 2916, + "time_per_iteration": 2.75644588470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101283, + "balance_loss_mlp": 1.08478427, + "epoch": 0.561177375913813, + "flos": 588366332928.0, + "grad_norm": 0.09688272488525364, + "language_loss": 0.82010305, + "learning_rate": 0.0004255082063421296, + "loss": 0.83111584, + "num_input_tokens_seen": 243525600, + "router_z_loss_mlp": 0.16503906, + "step": 2917, + "time_per_iteration": 2.7048747539520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101411, + "balance_loss_mlp": 1.08411336, + "epoch": 0.5613697575990766, + "flos": 527047824384.0, + "grad_norm": 0.05911652799286667, + "language_loss": 0.84559923, + "learning_rate": 0.00042520015542759065, + "loss": 0.8566134, + "num_input_tokens_seen": 243605536, + "router_z_loss_mlp": 0.17297363, + "step": 2918, + "time_per_iteration": 2.8888731002807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096781, + "balance_loss_mlp": 1.0798173, + "epoch": 0.5615621392843402, + "flos": 642655130112.0, + "grad_norm": 0.0855416495861322, + "language_loss": 0.87984401, + "learning_rate": 0.00042489213355292687, + "loss": 0.8908118, + "num_input_tokens_seen": 243684208, + "router_z_loss_mlp": 0.1697998, + "step": 2919, + "time_per_iteration": 2.9039535522460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099415, + "balance_loss_mlp": 1.08183169, + "epoch": 0.5617545209696037, + "flos": 427750543872.0, + "grad_norm": 0.09901142655299539, + "language_loss": 0.80785292, + "learning_rate": 0.00042458414083772276, + "loss": 0.81884712, + 
"num_input_tokens_seen": 243749376, + "router_z_loss_mlp": 0.17590332, + "step": 2920, + "time_per_iteration": 2.55914306640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100735, + "balance_loss_mlp": 1.08350968, + "epoch": 0.5619469026548672, + "flos": 568429125120.0, + "grad_norm": 0.058059763768477664, + "language_loss": 0.84851801, + "learning_rate": 0.000424276177401552, + "loss": 0.85952532, + "num_input_tokens_seen": 243828096, + "router_z_loss_mlp": 0.17248535, + "step": 2921, + "time_per_iteration": 2.847381353378296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090657, + "balance_loss_mlp": 1.07289529, + "epoch": 0.5621392843401308, + "flos": 505205807616.0, + "grad_norm": 0.08698061874066902, + "language_loss": 0.85584521, + "learning_rate": 0.0004239682433639763, + "loss": 0.86675179, + "num_input_tokens_seen": 243896752, + "router_z_loss_mlp": 0.17785645, + "step": 2922, + "time_per_iteration": 2.707058906555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095936, + "balance_loss_mlp": 1.07888877, + "epoch": 0.5623316660253944, + "flos": 516996628992.0, + "grad_norm": 0.07977820706870507, + "language_loss": 0.85277724, + "learning_rate": 0.0004236603388445467, + "loss": 0.86373651, + "num_input_tokens_seen": 243964592, + "router_z_loss_mlp": 0.1706543, + "step": 2923, + "time_per_iteration": 2.6301956176757812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090666, + "balance_loss_mlp": 1.07373846, + "epoch": 0.5625240477106579, + "flos": 606012917760.0, + "grad_norm": 0.07720818022124956, + "language_loss": 0.81903416, + "learning_rate": 0.00042335246396280166, + "loss": 0.8299408, + "num_input_tokens_seen": 244036656, + "router_z_loss_mlp": 0.16943359, + "step": 2924, + "time_per_iteration": 2.834073066711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090909, + "balance_loss_mlp": 1.07374263, + "epoch": 0.5627164293959215, + "flos": 
450430253568.0, + "grad_norm": 0.07626854299399176, + "language_loss": 0.9026264, + "learning_rate": 0.0004230446188382693, + "loss": 0.91353548, + "num_input_tokens_seen": 244102704, + "router_z_loss_mlp": 0.171875, + "step": 2925, + "time_per_iteration": 2.6027684211730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092312, + "balance_loss_mlp": 1.07481217, + "epoch": 0.5629088110811851, + "flos": 742073550336.0, + "grad_norm": 0.06785040334520868, + "language_loss": 0.80436468, + "learning_rate": 0.0004227368035904654, + "loss": 0.81528783, + "num_input_tokens_seen": 244186640, + "router_z_loss_mlp": 0.17504883, + "step": 2926, + "time_per_iteration": 3.005417585372925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097122, + "balance_loss_mlp": 1.0790019, + "epoch": 0.5631011927664487, + "flos": 496970588160.0, + "grad_norm": 0.06983498391207757, + "language_loss": 0.82735908, + "learning_rate": 0.00042242901833889474, + "loss": 0.83833027, + "num_input_tokens_seen": 244257680, + "router_z_loss_mlp": 0.18139648, + "step": 2927, + "time_per_iteration": 2.6397151947021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090818, + "balance_loss_mlp": 1.07340133, + "epoch": 0.5632935744517122, + "flos": 886137408000.0, + "grad_norm": 0.08127979757153865, + "language_loss": 0.85876542, + "learning_rate": 0.0004221212632030501, + "loss": 0.86967361, + "num_input_tokens_seen": 244331248, + "router_z_loss_mlp": 0.17443848, + "step": 2928, + "time_per_iteration": 3.098761558532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098859, + "balance_loss_mlp": 1.08115637, + "epoch": 0.5634859561369757, + "flos": 604792355328.0, + "grad_norm": 0.07359943981906872, + "language_loss": 0.80209559, + "learning_rate": 0.0004218135383024124, + "loss": 0.81308413, + "num_input_tokens_seen": 244403920, + "router_z_loss_mlp": 0.17724609, + "step": 2929, + "time_per_iteration": 2.7450544834136963 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087151, + "balance_loss_mlp": 1.06923413, + "epoch": 0.5636783378222393, + "flos": 453916472832.0, + "grad_norm": 0.08357226339131614, + "language_loss": 0.85142308, + "learning_rate": 0.0004215058437564511, + "loss": 0.86229455, + "num_input_tokens_seen": 244470464, + "router_z_loss_mlp": 0.17919922, + "step": 2930, + "time_per_iteration": 2.592543125152588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083944, + "balance_loss_mlp": 1.06644368, + "epoch": 0.5638707195075029, + "flos": 518456899584.0, + "grad_norm": 0.14879002546575693, + "language_loss": 0.82019955, + "learning_rate": 0.00042119817968462397, + "loss": 0.83103901, + "num_input_tokens_seen": 244536864, + "router_z_loss_mlp": 0.17504883, + "step": 2931, + "time_per_iteration": 2.645047187805176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080791, + "balance_loss_mlp": 1.06259942, + "epoch": 0.5640631011927665, + "flos": 564873896448.0, + "grad_norm": 0.08065967807891394, + "language_loss": 0.86642003, + "learning_rate": 0.0004208905462063766, + "loss": 0.87722796, + "num_input_tokens_seen": 244603344, + "router_z_loss_mlp": 0.18200684, + "step": 2932, + "time_per_iteration": 2.6538538932800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108164, + "balance_loss_mlp": 1.06381869, + "epoch": 0.56425548287803, + "flos": 517033704960.0, + "grad_norm": 0.07678540437917139, + "language_loss": 0.84284365, + "learning_rate": 0.00042058294344114315, + "loss": 0.85366011, + "num_input_tokens_seen": 244671984, + "router_z_loss_mlp": 0.17834473, + "step": 2933, + "time_per_iteration": 2.658790349960327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088075, + "balance_loss_mlp": 1.07069397, + "epoch": 0.5644478645632935, + "flos": 854258876928.0, + "grad_norm": 0.06842628935517767, + "language_loss": 0.77464747, + "learning_rate": 0.0004202753715083456, + "loss": 0.78552824, 
+ "num_input_tokens_seen": 244754000, + "router_z_loss_mlp": 0.1739502, + "step": 2934, + "time_per_iteration": 3.0965383052825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084539, + "balance_loss_mlp": 1.06742072, + "epoch": 0.5646402462485571, + "flos": 553438780416.0, + "grad_norm": 0.07525134320826762, + "language_loss": 0.80874884, + "learning_rate": 0.0004199678305273936, + "loss": 0.81959426, + "num_input_tokens_seen": 244820896, + "router_z_loss_mlp": 0.17126465, + "step": 2935, + "time_per_iteration": 2.6553165912628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097701, + "balance_loss_mlp": 1.08022487, + "epoch": 0.5648326279338207, + "flos": 685990798848.0, + "grad_norm": 0.06441901520709055, + "language_loss": 0.81395012, + "learning_rate": 0.0004196603206176854, + "loss": 0.82492715, + "num_input_tokens_seen": 244904464, + "router_z_loss_mlp": 0.17492676, + "step": 2936, + "time_per_iteration": 2.983830213546753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087212, + "balance_loss_mlp": 1.07004595, + "epoch": 0.5650250096190843, + "flos": 803327818752.0, + "grad_norm": 0.07452375479830534, + "language_loss": 0.83586991, + "learning_rate": 0.000419352841898607, + "loss": 0.84674203, + "num_input_tokens_seen": 244983760, + "router_z_loss_mlp": 0.171875, + "step": 2937, + "time_per_iteration": 3.003563404083252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089681, + "balance_loss_mlp": 1.07318234, + "epoch": 0.5652173913043478, + "flos": 582058317312.0, + "grad_norm": 0.07366437466259683, + "language_loss": 0.76944578, + "learning_rate": 0.000419045394489532, + "loss": 0.78034258, + "num_input_tokens_seen": 245053184, + "router_z_loss_mlp": 0.16503906, + "step": 2938, + "time_per_iteration": 2.6973941326141357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089785, + "balance_loss_mlp": 1.07220173, + "epoch": 0.5654097729896114, + "flos": 
820648060416.0, + "grad_norm": 0.09626894788078913, + "language_loss": 0.76665318, + "learning_rate": 0.0004187379785098224, + "loss": 0.77755105, + "num_input_tokens_seen": 245137408, + "router_z_loss_mlp": 0.17602539, + "step": 2939, + "time_per_iteration": 3.165407657623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089382, + "balance_loss_mlp": 1.07268023, + "epoch": 0.565602154674875, + "flos": 784156723200.0, + "grad_norm": 0.07214080103004945, + "language_loss": 0.83462155, + "learning_rate": 0.00041843059407882744, + "loss": 0.84551537, + "num_input_tokens_seen": 245215504, + "router_z_loss_mlp": 0.16711426, + "step": 2940, + "time_per_iteration": 2.9633572101593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086882, + "balance_loss_mlp": 1.06998992, + "epoch": 0.5657945363601385, + "flos": 549683117568.0, + "grad_norm": 0.07122107277750783, + "language_loss": 0.8230179, + "learning_rate": 0.0004181232413158842, + "loss": 0.83388674, + "num_input_tokens_seen": 245286032, + "router_z_loss_mlp": 0.16906738, + "step": 2941, + "time_per_iteration": 2.6848304271698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091116, + "balance_loss_mlp": 1.07422447, + "epoch": 0.5659869180454021, + "flos": 668126900736.0, + "grad_norm": 0.08263268782748946, + "language_loss": 0.82281923, + "learning_rate": 0.0004178159203403179, + "loss": 0.83373046, + "num_input_tokens_seen": 245359040, + "router_z_loss_mlp": 0.16906738, + "step": 2942, + "time_per_iteration": 2.84724760055542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090202, + "balance_loss_mlp": 1.07366729, + "epoch": 0.5661792997306656, + "flos": 499955369472.0, + "grad_norm": 0.06696308597668005, + "language_loss": 0.81382257, + "learning_rate": 0.0004175086312714409, + "loss": 0.82472456, + "num_input_tokens_seen": 245426384, + "router_z_loss_mlp": 0.16540527, + "step": 2943, + "time_per_iteration": 2.582885265350342 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092361, + "balance_loss_mlp": 1.0759573, + "epoch": 0.5663716814159292, + "flos": 601209589248.0, + "grad_norm": 0.060450118167724956, + "language_loss": 0.83769757, + "learning_rate": 0.00041720137422855366, + "loss": 0.84862119, + "num_input_tokens_seen": 245501216, + "router_z_loss_mlp": 0.1640625, + "step": 2944, + "time_per_iteration": 2.771480083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095642, + "balance_loss_mlp": 1.0798583, + "epoch": 0.5665640631011928, + "flos": 540988305408.0, + "grad_norm": 0.26231884968371866, + "language_loss": 0.7874673, + "learning_rate": 0.00041689414933094383, + "loss": 0.79842371, + "num_input_tokens_seen": 245571600, + "router_z_loss_mlp": 0.15771484, + "step": 2945, + "time_per_iteration": 2.6965370178222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096629, + "balance_loss_mlp": 1.08027291, + "epoch": 0.5667564447864564, + "flos": 601936054272.0, + "grad_norm": 0.08450400231002299, + "language_loss": 0.81155264, + "learning_rate": 0.00041658695669788653, + "loss": 0.82251894, + "num_input_tokens_seen": 245645632, + "router_z_loss_mlp": 0.16357422, + "step": 2946, + "time_per_iteration": 2.727442741394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105563, + "balance_loss_mlp": 1.08905292, + "epoch": 0.5669488264717198, + "flos": 659523492864.0, + "grad_norm": 0.08705150140664149, + "language_loss": 0.81145883, + "learning_rate": 0.00041627979644864453, + "loss": 0.82251441, + "num_input_tokens_seen": 245715776, + "router_z_loss_mlp": 0.16516113, + "step": 2947, + "time_per_iteration": 2.8466544151306152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112751, + "balance_loss_mlp": 1.0964433, + "epoch": 0.5671412081569834, + "flos": 485402222592.0, + "grad_norm": 0.062214847979028806, + "language_loss": 0.8092283, + "learning_rate": 0.0004159726687024683, + "loss": 0.82035577, 
+ "num_input_tokens_seen": 245785328, + "router_z_loss_mlp": 0.16308594, + "step": 2948, + "time_per_iteration": 2.649352788925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118496, + "balance_loss_mlp": 1.10242701, + "epoch": 0.567333589842247, + "flos": 729801114624.0, + "grad_norm": 0.09810621328318807, + "language_loss": 0.79565436, + "learning_rate": 0.00041566557357859506, + "loss": 0.80683935, + "num_input_tokens_seen": 245858000, + "router_z_loss_mlp": 0.16064453, + "step": 2949, + "time_per_iteration": 2.9100704193115234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128592, + "balance_loss_mlp": 1.11225998, + "epoch": 0.5675259715275106, + "flos": 968887526400.0, + "grad_norm": 0.08040833195953295, + "language_loss": 0.79227537, + "learning_rate": 0.0004153585111962502, + "loss": 0.80356133, + "num_input_tokens_seen": 245950640, + "router_z_loss_mlp": 0.16333008, + "step": 2950, + "time_per_iteration": 3.332738161087036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135277, + "balance_loss_mlp": 1.11884952, + "epoch": 0.5677183532127742, + "flos": 565145538048.0, + "grad_norm": 0.06937214621935889, + "language_loss": 0.84358597, + "learning_rate": 0.0004150514816746453, + "loss": 0.85493875, + "num_input_tokens_seen": 246019568, + "router_z_loss_mlp": 0.16418457, + "step": 2951, + "time_per_iteration": 2.712589979171753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138042, + "balance_loss_mlp": 1.12165022, + "epoch": 0.5679107348980377, + "flos": 551694385152.0, + "grad_norm": 0.07032847030676616, + "language_loss": 0.85400414, + "learning_rate": 0.0004147444851329802, + "loss": 0.86538458, + "num_input_tokens_seen": 246089520, + "router_z_loss_mlp": 0.16394043, + "step": 2952, + "time_per_iteration": 2.6828949451446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147692, + "balance_loss_mlp": 1.13107419, + "epoch": 0.5681031165833013, + "flos": 
819459804672.0, + "grad_norm": 0.07370144055460691, + "language_loss": 0.85637259, + "learning_rate": 0.00041443752169044126, + "loss": 0.86784947, + "num_input_tokens_seen": 246165920, + "router_z_loss_mlp": 0.16625977, + "step": 2953, + "time_per_iteration": 3.0499908924102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01156702, + "balance_loss_mlp": 1.13983333, + "epoch": 0.5682954982685648, + "flos": 618013711872.0, + "grad_norm": 0.07840541898783242, + "language_loss": 0.84904528, + "learning_rate": 0.0004141305914662025, + "loss": 0.86061233, + "num_input_tokens_seen": 246238672, + "router_z_loss_mlp": 0.16882324, + "step": 2954, + "time_per_iteration": 2.732133626937866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135414, + "balance_loss_mlp": 1.1186291, + "epoch": 0.5684878799538284, + "flos": 647949984768.0, + "grad_norm": 0.0690175597343332, + "language_loss": 0.80056989, + "learning_rate": 0.0004138236945794246, + "loss": 0.81192404, + "num_input_tokens_seen": 246320208, + "router_z_loss_mlp": 0.16784668, + "step": 2955, + "time_per_iteration": 2.920898914337158 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127256, + "balance_loss_mlp": 1.1108526, + "epoch": 0.5686802616390919, + "flos": 805961664000.0, + "grad_norm": 0.09346989124624208, + "language_loss": 0.83651698, + "learning_rate": 0.00041351683114925576, + "loss": 0.84778959, + "num_input_tokens_seen": 246406464, + "router_z_loss_mlp": 0.1640625, + "step": 2956, + "time_per_iteration": 3.1179428100585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122658, + "balance_loss_mlp": 1.10612392, + "epoch": 0.5688726433243555, + "flos": 547140676608.0, + "grad_norm": 0.07393250127791023, + "language_loss": 0.86702883, + "learning_rate": 0.0004132100012948308, + "loss": 0.87825537, + "num_input_tokens_seen": 246477456, + "router_z_loss_mlp": 0.16540527, + "step": 2957, + "time_per_iteration": 2.6336829662323 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127835, + "balance_loss_mlp": 1.11014426, + "epoch": 0.5690650250096191, + "flos": 486568456704.0, + "grad_norm": 0.08317259373738083, + "language_loss": 0.84444946, + "learning_rate": 0.00041290320513527145, + "loss": 0.85572779, + "num_input_tokens_seen": 246541744, + "router_z_loss_mlp": 0.17712402, + "step": 2958, + "time_per_iteration": 2.641665458679199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123865, + "balance_loss_mlp": 1.10708022, + "epoch": 0.5692574066948827, + "flos": 577457620992.0, + "grad_norm": 0.07155108401540258, + "language_loss": 0.8494001, + "learning_rate": 0.0004125964427896867, + "loss": 0.86063874, + "num_input_tokens_seen": 246611440, + "router_z_loss_mlp": 0.16796875, + "step": 2959, + "time_per_iteration": 2.6707890033721924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111469, + "balance_loss_mlp": 1.09486318, + "epoch": 0.5694497883801463, + "flos": 454247585280.0, + "grad_norm": 0.06610188466362152, + "language_loss": 0.79023135, + "learning_rate": 0.0004122897143771723, + "loss": 0.80134606, + "num_input_tokens_seen": 246676496, + "router_z_loss_mlp": 0.1661377, + "step": 2960, + "time_per_iteration": 2.564518690109253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113363, + "balance_loss_mlp": 1.09644711, + "epoch": 0.5696421700654097, + "flos": 559516999680.0, + "grad_norm": 0.06798711275929166, + "language_loss": 0.81482321, + "learning_rate": 0.0004119830200168109, + "loss": 0.82595682, + "num_input_tokens_seen": 246746464, + "router_z_loss_mlp": 0.16931152, + "step": 2961, + "time_per_iteration": 2.6972579956054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119725, + "balance_loss_mlp": 1.10334563, + "epoch": 0.5698345517506733, + "flos": 465551649792.0, + "grad_norm": 0.08529196588510703, + "language_loss": 0.88292432, + "learning_rate": 0.0004116763598276714, + "loss": 0.89412153, 
+ "num_input_tokens_seen": 246811808, + "router_z_loss_mlp": 0.16381836, + "step": 2962, + "time_per_iteration": 2.5670664310455322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110605, + "balance_loss_mlp": 1.09353447, + "epoch": 0.5700269334359369, + "flos": 605953446912.0, + "grad_norm": 0.06258641476293567, + "language_loss": 0.80866015, + "learning_rate": 0.00041136973392881017, + "loss": 0.81976616, + "num_input_tokens_seen": 246890432, + "router_z_loss_mlp": 0.17077637, + "step": 2963, + "time_per_iteration": 2.883714437484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106776, + "balance_loss_mlp": 1.08975244, + "epoch": 0.5702193151212005, + "flos": 562709182464.0, + "grad_norm": 0.07231503990514958, + "language_loss": 0.81792593, + "learning_rate": 0.00041106314243926983, + "loss": 0.82899374, + "num_input_tokens_seen": 246959616, + "router_z_loss_mlp": 0.17041016, + "step": 2964, + "time_per_iteration": 2.7783985137939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105614, + "balance_loss_mlp": 1.08862686, + "epoch": 0.570411696806464, + "flos": 523247745024.0, + "grad_norm": 0.0703519634607743, + "language_loss": 0.87298268, + "learning_rate": 0.0004107565854780798, + "loss": 0.88403881, + "num_input_tokens_seen": 247030656, + "router_z_loss_mlp": 0.17004395, + "step": 2965, + "time_per_iteration": 2.6647095680236816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105743, + "balance_loss_mlp": 1.08862448, + "epoch": 0.5706040784917276, + "flos": 718222837248.0, + "grad_norm": 0.10409226913166654, + "language_loss": 0.81182659, + "learning_rate": 0.000410450063164256, + "loss": 0.82288408, + "num_input_tokens_seen": 247105872, + "router_z_loss_mlp": 0.17126465, + "step": 2966, + "time_per_iteration": 2.866602659225464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104134, + "balance_loss_mlp": 1.08703911, + "epoch": 0.5707964601769911, + "flos": 
476707410432.0, + "grad_norm": 0.07688057786324835, + "language_loss": 0.82004988, + "learning_rate": 0.00041014357561680115, + "loss": 0.83109128, + "num_input_tokens_seen": 247170448, + "router_z_loss_mlp": 0.17114258, + "step": 2967, + "time_per_iteration": 2.5523133277893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109926, + "balance_loss_mlp": 1.09312987, + "epoch": 0.5709888418622547, + "flos": 580101378048.0, + "grad_norm": 0.0904159605578498, + "language_loss": 0.86166346, + "learning_rate": 0.0004098371229547039, + "loss": 0.87276274, + "num_input_tokens_seen": 247240400, + "router_z_loss_mlp": 0.16809082, + "step": 2968, + "time_per_iteration": 2.724207878112793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_mlp": 1.022156, + "epoch": 0.5711812235475183, + "flos": 1579922910720.0, + "grad_norm": 0.013041633212772678, + "language_loss": 0.80010808, + "learning_rate": 0.0004095307052969399, + "loss": 0.81042308, + "num_input_tokens_seen": 247469136, + "router_z_loss_mlp": 0.09326172, + "step": 2969, + "time_per_iteration": 4.806856155395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113794, + "balance_loss_mlp": 1.09678328, + "epoch": 0.5713736052327818, + "flos": 468506695680.0, + "grad_norm": 0.07993701822539574, + "language_loss": 0.80239302, + "learning_rate": 0.00040922432276247107, + "loss": 0.81353092, + "num_input_tokens_seen": 247537712, + "router_z_loss_mlp": 0.17028809, + "step": 2970, + "time_per_iteration": 2.603079319000244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119244, + "balance_loss_mlp": 1.1021136, + "epoch": 0.5715659869180454, + "flos": 537662499840.0, + "grad_norm": 0.07050688201783964, + "language_loss": 0.84539342, + "learning_rate": 0.0004089179754702457, + "loss": 0.85658586, + "num_input_tokens_seen": 247613872, + "router_z_loss_mlp": 0.17150879, + "step": 2971, + "time_per_iteration": 2.806685209274292 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125569, + "balance_loss_mlp": 1.10841513, + "epoch": 0.571758368603309, + "flos": 656071778304.0, + "grad_norm": 0.1127525051095751, + "language_loss": 0.79654694, + "learning_rate": 0.00040861166353919843, + "loss": 0.80780256, + "num_input_tokens_seen": 247686064, + "router_z_loss_mlp": 0.17175293, + "step": 2972, + "time_per_iteration": 2.822960138320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122737, + "balance_loss_mlp": 1.10572612, + "epoch": 0.5719507502885726, + "flos": 667907016192.0, + "grad_norm": 0.06522156109142956, + "language_loss": 0.81529987, + "learning_rate": 0.00040830538708824983, + "loss": 0.8265273, + "num_input_tokens_seen": 247760384, + "router_z_loss_mlp": 0.17028809, + "step": 2973, + "time_per_iteration": 2.883183479309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114225, + "balance_loss_mlp": 1.09716594, + "epoch": 0.572143131973836, + "flos": 476321969664.0, + "grad_norm": 0.05988777943056807, + "language_loss": 0.81712234, + "learning_rate": 0.000407999146236307, + "loss": 0.82826465, + "num_input_tokens_seen": 247824768, + "router_z_loss_mlp": 0.17077637, + "step": 2974, + "time_per_iteration": 2.583639144897461 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113969, + "balance_loss_mlp": 1.09735084, + "epoch": 0.5723355136590996, + "flos": 539510782464.0, + "grad_norm": 0.08488733778098946, + "language_loss": 0.83322281, + "learning_rate": 0.0004076929411022634, + "loss": 0.84436244, + "num_input_tokens_seen": 247894448, + "router_z_loss_mlp": 0.16625977, + "step": 2975, + "time_per_iteration": 2.6634230613708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117906, + "balance_loss_mlp": 1.10096645, + "epoch": 0.5725278953443632, + "flos": 824156674560.0, + "grad_norm": 0.10471513442043413, + "language_loss": 0.7910713, + "learning_rate": 0.0004073867718049982, + "loss": 0.80225033, + 
"num_input_tokens_seen": 247976432, + "router_z_loss_mlp": 0.16955566, + "step": 2976, + "time_per_iteration": 3.101864814758301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116871, + "balance_loss_mlp": 1.10026503, + "epoch": 0.5727202770296268, + "flos": 587437235712.0, + "grad_norm": 0.08664196816998121, + "language_loss": 0.82484782, + "learning_rate": 0.00040708063846337704, + "loss": 0.83601654, + "num_input_tokens_seen": 248048800, + "router_z_loss_mlp": 0.1661377, + "step": 2977, + "time_per_iteration": 2.7438297271728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106932, + "balance_loss_mlp": 1.08967066, + "epoch": 0.5729126587148904, + "flos": 446966055936.0, + "grad_norm": 0.07799786255299582, + "language_loss": 0.81199914, + "learning_rate": 0.00040677454119625143, + "loss": 0.8230685, + "num_input_tokens_seen": 248116496, + "router_z_loss_mlp": 0.17285156, + "step": 2978, + "time_per_iteration": 2.5837550163269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095322, + "balance_loss_mlp": 1.07809663, + "epoch": 0.5731050404001539, + "flos": 519457577472.0, + "grad_norm": 0.1059947946829761, + "language_loss": 0.82621056, + "learning_rate": 0.0004064684801224587, + "loss": 0.83716381, + "num_input_tokens_seen": 248184960, + "router_z_loss_mlp": 0.17236328, + "step": 2979, + "time_per_iteration": 2.6220715045928955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095905, + "balance_loss_mlp": 1.07850003, + "epoch": 0.5732974220854175, + "flos": 504775950336.0, + "grad_norm": 0.06700215842091113, + "language_loss": 0.80611891, + "learning_rate": 0.00040616245536082224, + "loss": 0.81707793, + "num_input_tokens_seen": 248252208, + "router_z_loss_mlp": 0.17431641, + "step": 2980, + "time_per_iteration": 2.6067917346954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086913, + "balance_loss_mlp": 1.069556, + "epoch": 0.573489803770681, + "flos": 
592485041664.0, + "grad_norm": 0.19945027498537377, + "language_loss": 0.81268358, + "learning_rate": 0.00040585646703015165, + "loss": 0.82355273, + "num_input_tokens_seen": 248333312, + "router_z_loss_mlp": 0.17370605, + "step": 2981, + "time_per_iteration": 2.910644769668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087867, + "balance_loss_mlp": 1.07096314, + "epoch": 0.5736821854559446, + "flos": 489911514624.0, + "grad_norm": 0.06421268852729406, + "language_loss": 0.78161913, + "learning_rate": 0.0004055505152492419, + "loss": 0.79249781, + "num_input_tokens_seen": 248403808, + "router_z_loss_mlp": 0.16918945, + "step": 2982, + "time_per_iteration": 2.6653785705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084086, + "balance_loss_mlp": 1.06670547, + "epoch": 0.5738745671412081, + "flos": 458156321280.0, + "grad_norm": 0.08054865949602324, + "language_loss": 0.73896229, + "learning_rate": 0.00040524460013687425, + "loss": 0.74980319, + "num_input_tokens_seen": 248477184, + "router_z_loss_mlp": 0.1739502, + "step": 2983, + "time_per_iteration": 2.721282958984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090667, + "balance_loss_mlp": 1.07357204, + "epoch": 0.5740669488264717, + "flos": 580333372416.0, + "grad_norm": 0.08106324915579151, + "language_loss": 0.81038249, + "learning_rate": 0.0004049387218118155, + "loss": 0.82128918, + "num_input_tokens_seen": 248565552, + "router_z_loss_mlp": 0.17102051, + "step": 2984, + "time_per_iteration": 2.9739558696746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109026, + "balance_loss_mlp": 1.07321286, + "epoch": 0.5742593305117353, + "flos": 524438572032.0, + "grad_norm": 0.07771926917330779, + "language_loss": 0.84678066, + "learning_rate": 0.00040463288039281777, + "loss": 0.85768324, + "num_input_tokens_seen": 248635456, + "router_z_loss_mlp": 0.1706543, + "step": 2985, + "time_per_iteration": 2.755789279937744 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049819, + "balance_loss_mlp": 1.0396148, + "epoch": 0.5744517121969989, + "flos": 1553877748224.0, + "grad_norm": 0.027186215876947157, + "language_loss": 0.77876419, + "learning_rate": 0.0004043270759986194, + "loss": 0.78926235, + "num_input_tokens_seen": 248870160, + "router_z_loss_mlp": 0.10205078, + "step": 2986, + "time_per_iteration": 5.024104833602905 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102309, + "balance_loss_mlp": 1.08496404, + "epoch": 0.5746440938822625, + "flos": 751919915520.0, + "grad_norm": 0.07406110021904912, + "language_loss": 0.82250667, + "learning_rate": 0.0004040213087479444, + "loss": 0.83352977, + "num_input_tokens_seen": 248946960, + "router_z_loss_mlp": 0.17346191, + "step": 2987, + "time_per_iteration": 2.954012632369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110505, + "balance_loss_mlp": 1.0885036, + "epoch": 0.5748364755675259, + "flos": 501865320960.0, + "grad_norm": 0.08213209001088305, + "language_loss": 0.85105377, + "learning_rate": 0.0004037155787595018, + "loss": 0.86210424, + "num_input_tokens_seen": 249014128, + "router_z_loss_mlp": 0.16552734, + "step": 2988, + "time_per_iteration": 2.596590757369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103556, + "balance_loss_mlp": 1.08671117, + "epoch": 0.5750288572527895, + "flos": 504044342784.0, + "grad_norm": 0.06658279323725882, + "language_loss": 0.80333447, + "learning_rate": 0.000403409886151987, + "loss": 0.8143701, + "num_input_tokens_seen": 249090016, + "router_z_loss_mlp": 0.1685791, + "step": 2989, + "time_per_iteration": 2.9190666675567627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049496, + "balance_loss_mlp": 1.03948224, + "epoch": 0.5752212389380531, + "flos": 1541365604352.0, + "grad_norm": 0.024963739862010757, + "language_loss": 0.81999105, + "learning_rate": 0.0004031042310440799, + "loss": 0.830486, 
+ "num_input_tokens_seen": 249305552, + "router_z_loss_mlp": 0.10009766, + "step": 2990, + "time_per_iteration": 4.779403448104858 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_mlp": 1.03442252, + "epoch": 0.5754136206233167, + "flos": 1567331472384.0, + "grad_norm": 0.02279292821926405, + "language_loss": 0.781986, + "learning_rate": 0.00040279861355444656, + "loss": 0.79242849, + "num_input_tokens_seen": 249523408, + "router_z_loss_mlp": 0.09814453, + "step": 2991, + "time_per_iteration": 4.813813209533691 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104349, + "balance_loss_mlp": 1.08761191, + "epoch": 0.5756060023085803, + "flos": 798156301824.0, + "grad_norm": 0.07351496217070447, + "language_loss": 0.76526999, + "learning_rate": 0.00040249303380173807, + "loss": 0.77631354, + "num_input_tokens_seen": 249616624, + "router_z_loss_mlp": 0.16748047, + "step": 2992, + "time_per_iteration": 3.0984480381011963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099869, + "balance_loss_mlp": 1.08323884, + "epoch": 0.5757983839938438, + "flos": 587877004800.0, + "grad_norm": 0.07106147833910306, + "language_loss": 0.78964388, + "learning_rate": 0.00040218749190459126, + "loss": 0.80064261, + "num_input_tokens_seen": 249689936, + "router_z_loss_mlp": 0.16638184, + "step": 2993, + "time_per_iteration": 2.7525393962860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109225, + "balance_loss_mlp": 1.07550144, + "epoch": 0.5759907656791073, + "flos": 516831072768.0, + "grad_norm": 0.07997694276494066, + "language_loss": 0.82424486, + "learning_rate": 0.00040188198798162775, + "loss": 0.83516741, + "num_input_tokens_seen": 249759984, + "router_z_loss_mlp": 0.16760254, + "step": 2994, + "time_per_iteration": 2.6026856899261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105077, + "balance_loss_mlp": 1.08812571, + "epoch": 0.5761831473643709, + "flos": 
587133287424.0, + "grad_norm": 0.060991263028610375, + "language_loss": 0.85548359, + "learning_rate": 0.000401576522151455, + "loss": 0.86653435, + "num_input_tokens_seen": 249837888, + "router_z_loss_mlp": 0.16955566, + "step": 2995, + "time_per_iteration": 2.8387343883514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097873, + "balance_loss_mlp": 1.08148181, + "epoch": 0.5763755290496345, + "flos": 543896363520.0, + "grad_norm": 0.0649014718190417, + "language_loss": 0.82459986, + "learning_rate": 0.0004012710945326651, + "loss": 0.83557856, + "num_input_tokens_seen": 249913584, + "router_z_loss_mlp": 0.16394043, + "step": 2996, + "time_per_iteration": 2.8002259731292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099492, + "balance_loss_mlp": 1.08355331, + "epoch": 0.576567910734898, + "flos": 626229107712.0, + "grad_norm": 0.07884412717722156, + "language_loss": 0.80980134, + "learning_rate": 0.0004009657052438355, + "loss": 0.82079625, + "num_input_tokens_seen": 249992144, + "router_z_loss_mlp": 0.15930176, + "step": 2997, + "time_per_iteration": 2.8380162715911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106429, + "balance_loss_mlp": 1.09044361, + "epoch": 0.5767602924201616, + "flos": 538243232256.0, + "grad_norm": 0.09100511136442054, + "language_loss": 0.8548094, + "learning_rate": 0.00040066035440352904, + "loss": 0.86587369, + "num_input_tokens_seen": 250060736, + "router_z_loss_mlp": 0.15979004, + "step": 2998, + "time_per_iteration": 2.7165040969848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054645, + "balance_loss_mlp": 1.04687226, + "epoch": 0.5769526741054252, + "flos": 1559778301440.0, + "grad_norm": 0.029413044868518267, + "language_loss": 0.79293132, + "learning_rate": 0.0004003550421302934, + "loss": 0.80347776, + "num_input_tokens_seen": 250296864, + "router_z_loss_mlp": 0.07763672, + "step": 2999, + "time_per_iteration": 4.891362905502319 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105098, + "balance_loss_mlp": 1.08894527, + "epoch": 0.5771450557906888, + "flos": 468185495040.0, + "grad_norm": 0.08263350927787948, + "language_loss": 0.75637519, + "learning_rate": 0.00040004976854266145, + "loss": 0.76742619, + "num_input_tokens_seen": 250362528, + "router_z_loss_mlp": 0.16149902, + "step": 3000, + "time_per_iteration": 2.5579755306243896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105893, + "balance_loss_mlp": 1.08987141, + "epoch": 0.5773374374759523, + "flos": 574556903424.0, + "grad_norm": 0.06941869769704709, + "language_loss": 0.81322896, + "learning_rate": 0.0003997445337591505, + "loss": 0.82428795, + "num_input_tokens_seen": 250432768, + "router_z_loss_mlp": 0.16027832, + "step": 3001, + "time_per_iteration": 2.689349889755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104956, + "balance_loss_mlp": 1.0884937, + "epoch": 0.5775298191612158, + "flos": 528473590272.0, + "grad_norm": 0.09192868754767076, + "language_loss": 0.74184531, + "learning_rate": 0.0003994393378982635, + "loss": 0.75289488, + "num_input_tokens_seen": 250501504, + "router_z_loss_mlp": 0.16467285, + "step": 3002, + "time_per_iteration": 2.6561992168426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074164, + "balance_loss_mlp": 1.06658196, + "epoch": 0.5777222008464794, + "flos": 1303919700480.0, + "grad_norm": 0.035051917356449074, + "language_loss": 0.79538, + "learning_rate": 0.00039913418107848786, + "loss": 0.80612171, + "num_input_tokens_seen": 250733632, + "router_z_loss_mlp": 0.07568359, + "step": 3003, + "time_per_iteration": 4.835859298706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101477, + "balance_loss_mlp": 1.0852406, + "epoch": 0.577914582531743, + "flos": 603633461760.0, + "grad_norm": 0.07939797508674061, + "language_loss": 0.8815853, + "learning_rate": 0.0003988290634182961, + "loss": 
0.89260006, + "num_input_tokens_seen": 250809152, + "router_z_loss_mlp": 0.16235352, + "step": 3004, + "time_per_iteration": 2.8315813541412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106342, + "balance_loss_mlp": 1.09034419, + "epoch": 0.5781069642170066, + "flos": 486795681792.0, + "grad_norm": 0.07086440080231367, + "language_loss": 0.80762905, + "learning_rate": 0.0003985239850361453, + "loss": 0.81869251, + "num_input_tokens_seen": 250879152, + "router_z_loss_mlp": 0.15991211, + "step": 3005, + "time_per_iteration": 2.6647462844848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100467, + "balance_loss_mlp": 1.08430243, + "epoch": 0.5782993459022701, + "flos": 506295318528.0, + "grad_norm": 0.07031230145466298, + "language_loss": 0.84713155, + "learning_rate": 0.0003982189460504777, + "loss": 0.85813624, + "num_input_tokens_seen": 250949904, + "router_z_loss_mlp": 0.16162109, + "step": 3006, + "time_per_iteration": 2.70588755607605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104818, + "balance_loss_mlp": 1.08837891, + "epoch": 0.5784917275875336, + "flos": 602155938816.0, + "grad_norm": 0.07782537057878013, + "language_loss": 0.78822792, + "learning_rate": 0.00039791394657971935, + "loss": 0.79927599, + "num_input_tokens_seen": 251020976, + "router_z_loss_mlp": 0.16442871, + "step": 3007, + "time_per_iteration": 2.7525734901428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112062, + "balance_loss_mlp": 1.09575403, + "epoch": 0.5786841092727972, + "flos": 521540425728.0, + "grad_norm": 0.08023947055085524, + "language_loss": 0.84335512, + "learning_rate": 0.00039760898674228205, + "loss": 0.85447574, + "num_input_tokens_seen": 251093280, + "router_z_loss_mlp": 0.16308594, + "step": 3008, + "time_per_iteration": 2.6740429401397705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.08913136, + "epoch": 0.5788764909580608, 
+ "flos": 767404357632.0, + "grad_norm": 0.06481055961735596, + "language_loss": 0.80689526, + "learning_rate": 0.0003973040666565613, + "loss": 0.81794715, + "num_input_tokens_seen": 251181376, + "router_z_loss_mlp": 0.16052246, + "step": 3009, + "time_per_iteration": 3.0985798835754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105331, + "balance_loss_mlp": 1.08880866, + "epoch": 0.5790688726433244, + "flos": 599094434304.0, + "grad_norm": 0.07104717657711816, + "language_loss": 0.8190769, + "learning_rate": 0.000396999186440938, + "loss": 0.83013022, + "num_input_tokens_seen": 251256176, + "router_z_loss_mlp": 0.1652832, + "step": 3010, + "time_per_iteration": 2.8631935119628906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095613, + "balance_loss_mlp": 1.07888842, + "epoch": 0.5792612543285879, + "flos": 523064936448.0, + "grad_norm": 0.07539914783858101, + "language_loss": 0.85185289, + "learning_rate": 0.000396694346213777, + "loss": 0.86280894, + "num_input_tokens_seen": 251325344, + "router_z_loss_mlp": 0.1673584, + "step": 3011, + "time_per_iteration": 2.7040622234344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093449, + "balance_loss_mlp": 1.0765686, + "epoch": 0.5794536360138515, + "flos": 876557915136.0, + "grad_norm": 0.06256207841015303, + "language_loss": 0.83364058, + "learning_rate": 0.0003963895460934276, + "loss": 0.84457505, + "num_input_tokens_seen": 251406656, + "router_z_loss_mlp": 0.16882324, + "step": 3012, + "time_per_iteration": 3.173614025115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089884, + "balance_loss_mlp": 1.07312369, + "epoch": 0.5796460176991151, + "flos": 401436311040.0, + "grad_norm": 0.08299946451997237, + "language_loss": 0.85058802, + "learning_rate": 0.00039608478619822376, + "loss": 0.86148685, + "num_input_tokens_seen": 251467760, + "router_z_loss_mlp": 0.16772461, + "step": 3013, + "time_per_iteration": 2.4611692428588867 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081939, + "balance_loss_mlp": 1.065166, + "epoch": 0.5798383993843786, + "flos": 618517721088.0, + "grad_norm": 0.06639451681987794, + "language_loss": 0.82375103, + "learning_rate": 0.00039578006664648394, + "loss": 0.83457041, + "num_input_tokens_seen": 251542272, + "router_z_loss_mlp": 0.16784668, + "step": 3014, + "time_per_iteration": 2.789212703704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085955, + "balance_loss_mlp": 1.06965923, + "epoch": 0.5800307810696421, + "flos": 844331019264.0, + "grad_norm": 0.08034627380925646, + "language_loss": 0.81074166, + "learning_rate": 0.0003954753875565105, + "loss": 0.82160121, + "num_input_tokens_seen": 251625584, + "router_z_loss_mlp": 0.16296387, + "step": 3015, + "time_per_iteration": 3.1160459518432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082316, + "balance_loss_mlp": 1.06503117, + "epoch": 0.5802231627549057, + "flos": 569276729856.0, + "grad_norm": 0.06677664636320767, + "language_loss": 0.82464337, + "learning_rate": 0.00039517074904659057, + "loss": 0.83546656, + "num_input_tokens_seen": 251696704, + "router_z_loss_mlp": 0.1730957, + "step": 3016, + "time_per_iteration": 2.716564655303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087588, + "balance_loss_mlp": 1.07085133, + "epoch": 0.5804155444401693, + "flos": 660459930624.0, + "grad_norm": 0.0799627957481028, + "language_loss": 0.84913206, + "learning_rate": 0.00039486615123499535, + "loss": 0.86000794, + "num_input_tokens_seen": 251774784, + "router_z_loss_mlp": 0.16748047, + "step": 3017, + "time_per_iteration": 2.855402708053589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079434, + "balance_loss_mlp": 1.06237507, + "epoch": 0.5806079261254329, + "flos": 513992024064.0, + "grad_norm": 0.08435209251616928, + "language_loss": 0.85015523, + "learning_rate": 0.00039456159423997996, + "loss": 
0.86094958, + "num_input_tokens_seen": 251844768, + "router_z_loss_mlp": 0.17077637, + "step": 3018, + "time_per_iteration": 2.6843197345733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079601, + "balance_loss_mlp": 1.06261373, + "epoch": 0.5808003078106965, + "flos": 528646487040.0, + "grad_norm": 0.06274674533021377, + "language_loss": 0.89687812, + "learning_rate": 0.00039425707817978406, + "loss": 0.90767419, + "num_input_tokens_seen": 251912736, + "router_z_loss_mlp": 0.16992188, + "step": 3019, + "time_per_iteration": 2.681183099746704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076495, + "balance_loss_mlp": 1.05895901, + "epoch": 0.58099268949596, + "flos": 477028611072.0, + "grad_norm": 0.14184929094941942, + "language_loss": 0.83556581, + "learning_rate": 0.00039395260317263124, + "loss": 0.84633076, + "num_input_tokens_seen": 251979328, + "router_z_loss_mlp": 0.17553711, + "step": 3020, + "time_per_iteration": 2.629709482192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073542, + "balance_loss_mlp": 1.05577993, + "epoch": 0.5811850711812235, + "flos": 517609294848.0, + "grad_norm": 0.08203162266100236, + "language_loss": 0.84840143, + "learning_rate": 0.0003936481693367291, + "loss": 0.85913682, + "num_input_tokens_seen": 252050928, + "router_z_loss_mlp": 0.1776123, + "step": 3021, + "time_per_iteration": 2.717710018157959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083798, + "balance_loss_mlp": 1.06607115, + "epoch": 0.5813774528664871, + "flos": 616422389760.0, + "grad_norm": 0.08298145922497896, + "language_loss": 0.87323809, + "learning_rate": 0.0003933437767902697, + "loss": 0.88407612, + "num_input_tokens_seen": 252126496, + "router_z_loss_mlp": 0.17749023, + "step": 3022, + "time_per_iteration": 2.8179917335510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.07563782, + "epoch": 0.5815698345517507, + 
"flos": 567475435008.0, + "grad_norm": 0.07663513037653054, + "language_loss": 0.77978808, + "learning_rate": 0.00039303942565142825, + "loss": 0.79071838, + "num_input_tokens_seen": 252203008, + "router_z_loss_mlp": 0.17407227, + "step": 3023, + "time_per_iteration": 2.7656824588775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092602, + "balance_loss_mlp": 1.07522154, + "epoch": 0.5817622162370142, + "flos": 563168775168.0, + "grad_norm": 0.09353579288790682, + "language_loss": 0.76389718, + "learning_rate": 0.0003927351160383644, + "loss": 0.77482319, + "num_input_tokens_seen": 252283440, + "router_z_loss_mlp": 0.1739502, + "step": 3024, + "time_per_iteration": 2.81196665763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096766, + "balance_loss_mlp": 1.07968342, + "epoch": 0.5819545979222778, + "flos": 459216470016.0, + "grad_norm": 0.05988996320852443, + "language_loss": 0.77658468, + "learning_rate": 0.000392430848069222, + "loss": 0.78755236, + "num_input_tokens_seen": 252351760, + "router_z_loss_mlp": 0.17089844, + "step": 3025, + "time_per_iteration": 2.553349733352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095833, + "balance_loss_mlp": 1.07864261, + "epoch": 0.5821469796075414, + "flos": 541475062272.0, + "grad_norm": 0.09842162601860249, + "language_loss": 0.82432085, + "learning_rate": 0.00039212662186212795, + "loss": 0.83527917, + "num_input_tokens_seen": 252418480, + "router_z_loss_mlp": 0.17199707, + "step": 3026, + "time_per_iteration": 2.6321003437042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096184, + "balance_loss_mlp": 1.07874346, + "epoch": 0.582339361292805, + "flos": 552262634496.0, + "grad_norm": 0.06216962714468932, + "language_loss": 0.77065325, + "learning_rate": 0.0003918224375351934, + "loss": 0.78161508, + "num_input_tokens_seen": 252493712, + "router_z_loss_mlp": 0.17468262, + "step": 3027, + "time_per_iteration": 2.7319040298461914 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102446, + "balance_loss_mlp": 1.08531559, + "epoch": 0.5825317429780685, + "flos": 496399767552.0, + "grad_norm": 0.06463813423056745, + "language_loss": 0.78389823, + "learning_rate": 0.0003915182952065135, + "loss": 0.79492265, + "num_input_tokens_seen": 252566096, + "router_z_loss_mlp": 0.17138672, + "step": 3028, + "time_per_iteration": 2.6997907161712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097892, + "balance_loss_mlp": 1.08095205, + "epoch": 0.582724124663332, + "flos": 564162112512.0, + "grad_norm": 0.07943165793883354, + "language_loss": 0.87551522, + "learning_rate": 0.0003912141949941664, + "loss": 0.8864941, + "num_input_tokens_seen": 252639424, + "router_z_loss_mlp": 0.16955566, + "step": 3029, + "time_per_iteration": 2.7122318744659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091171, + "balance_loss_mlp": 1.07376611, + "epoch": 0.5829165063485956, + "flos": 492132754944.0, + "grad_norm": 0.08419707099866325, + "language_loss": 0.82715654, + "learning_rate": 0.0003909101370162143, + "loss": 0.83806825, + "num_input_tokens_seen": 252706672, + "router_z_loss_mlp": 0.17431641, + "step": 3030, + "time_per_iteration": 2.6301612854003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010224, + "balance_loss_mlp": 1.00211763, + "epoch": 0.5831088880338592, + "flos": 1528880997888.0, + "grad_norm": 0.006956762065680846, + "language_loss": 0.72433889, + "learning_rate": 0.00039060612139070326, + "loss": 0.73444116, + "num_input_tokens_seen": 252932464, + "router_z_loss_mlp": 0.08105469, + "step": 3031, + "time_per_iteration": 4.870691299438477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091064, + "balance_loss_mlp": 1.07400537, + "epoch": 0.5833012697191228, + "flos": 618011140608.0, + "grad_norm": 0.08204338633061625, + "language_loss": 0.82931381, + "learning_rate": 0.0003903021482356622, + 
"loss": 0.8402245, + "num_input_tokens_seen": 253011920, + "router_z_loss_mlp": 0.1706543, + "step": 3032, + "time_per_iteration": 2.829430103302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091732, + "balance_loss_mlp": 1.07503033, + "epoch": 0.5834936514043862, + "flos": 767920849920.0, + "grad_norm": 0.08520682753706012, + "language_loss": 0.82501173, + "learning_rate": 0.00038999821766910465, + "loss": 0.8359291, + "num_input_tokens_seen": 253091552, + "router_z_loss_mlp": 0.16711426, + "step": 3033, + "time_per_iteration": 3.0449070930480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087325, + "balance_loss_mlp": 1.07023025, + "epoch": 0.5836860330896498, + "flos": 458371436544.0, + "grad_norm": 0.07138585009560579, + "language_loss": 0.85493183, + "learning_rate": 0.00038969432980902606, + "loss": 0.86580509, + "num_input_tokens_seen": 253158608, + "router_z_loss_mlp": 0.17114258, + "step": 3034, + "time_per_iteration": 2.6099114418029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01015774, + "balance_loss_mlp": 1.00771523, + "epoch": 0.5838784147749134, + "flos": 1361225585664.0, + "grad_norm": 0.011956814182891856, + "language_loss": 0.79784501, + "learning_rate": 0.0003893904847734068, + "loss": 0.80800277, + "num_input_tokens_seen": 253381184, + "router_z_loss_mlp": 0.08056641, + "step": 3035, + "time_per_iteration": 4.919405460357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084349, + "balance_loss_mlp": 1.0678978, + "epoch": 0.584070796460177, + "flos": 567211133952.0, + "grad_norm": 0.0762930329312805, + "language_loss": 0.82252562, + "learning_rate": 0.00038908668268020953, + "loss": 0.83336914, + "num_input_tokens_seen": 253452880, + "router_z_loss_mlp": 0.16455078, + "step": 3036, + "time_per_iteration": 2.7005980014801025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082695, + "balance_loss_mlp": 1.06603003, + "epoch": 
0.5842631781454406, + "flos": 611483240448.0, + "grad_norm": 0.07750025430989764, + "language_loss": 0.84744304, + "learning_rate": 0.00038878292364738097, + "loss": 0.85826999, + "num_input_tokens_seen": 253530000, + "router_z_loss_mlp": 0.16674805, + "step": 3037, + "time_per_iteration": 2.854461908340454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085343, + "balance_loss_mlp": 1.0690949, + "epoch": 0.5844555598307041, + "flos": 463384737792.0, + "grad_norm": 0.0866866607830145, + "language_loss": 0.86865294, + "learning_rate": 0.0003884792077928508, + "loss": 0.87950635, + "num_input_tokens_seen": 253593504, + "router_z_loss_mlp": 0.16235352, + "step": 3038, + "time_per_iteration": 2.526219606399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085659, + "balance_loss_mlp": 1.06974506, + "epoch": 0.5846479415159677, + "flos": 410215186944.0, + "grad_norm": 0.09714525133414084, + "language_loss": 0.76819932, + "learning_rate": 0.0003881755352345322, + "loss": 0.77905595, + "num_input_tokens_seen": 253657904, + "router_z_loss_mlp": 0.15905762, + "step": 3039, + "time_per_iteration": 2.5546979904174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086917, + "balance_loss_mlp": 1.0702157, + "epoch": 0.5848403232012312, + "flos": 491297633280.0, + "grad_norm": 0.09749751366402076, + "language_loss": 0.86787152, + "learning_rate": 0.0003878719060903207, + "loss": 0.87874067, + "num_input_tokens_seen": 253725280, + "router_z_loss_mlp": 0.16711426, + "step": 3040, + "time_per_iteration": 2.585848093032837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091729, + "balance_loss_mlp": 1.07531416, + "epoch": 0.5850327048864948, + "flos": 584417949696.0, + "grad_norm": 0.0840209110893744, + "language_loss": 0.83088207, + "learning_rate": 0.0003875683204780961, + "loss": 0.84179938, + "num_input_tokens_seen": 253795040, + "router_z_loss_mlp": 0.16418457, + "step": 3041, + "time_per_iteration": 
2.7646286487579346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096113, + "balance_loss_mlp": 1.08006763, + "epoch": 0.5852250865717584, + "flos": 651545233920.0, + "grad_norm": 0.08651728983241819, + "language_loss": 0.85210633, + "learning_rate": 0.00038726477851572043, + "loss": 0.86306751, + "num_input_tokens_seen": 253866384, + "router_z_loss_mlp": 0.16040039, + "step": 3042, + "time_per_iteration": 2.797314167022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101767, + "balance_loss_mlp": 1.08557868, + "epoch": 0.5854174682570219, + "flos": 534588885504.0, + "grad_norm": 0.08316199388994981, + "language_loss": 0.80228806, + "learning_rate": 0.0003869612803210395, + "loss": 0.81330574, + "num_input_tokens_seen": 253935712, + "router_z_loss_mlp": 0.16186523, + "step": 3043, + "time_per_iteration": 2.6490185260772705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103183, + "balance_loss_mlp": 1.08701873, + "epoch": 0.5856098499422855, + "flos": 509752175616.0, + "grad_norm": 0.06777837645025765, + "language_loss": 0.83051372, + "learning_rate": 0.0003866578260118817, + "loss": 0.84154546, + "num_input_tokens_seen": 254003152, + "router_z_loss_mlp": 0.16162109, + "step": 3044, + "time_per_iteration": 2.6326801776885986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106723, + "balance_loss_mlp": 1.09098744, + "epoch": 0.5858022316275491, + "flos": 593893555200.0, + "grad_norm": 0.07505807062734855, + "language_loss": 0.83121902, + "learning_rate": 0.0003863544157060581, + "loss": 0.84228623, + "num_input_tokens_seen": 254072816, + "router_z_loss_mlp": 0.15722656, + "step": 3045, + "time_per_iteration": 2.7122910022735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113369, + "balance_loss_mlp": 1.09763348, + "epoch": 0.5859946133128127, + "flos": 559126416384.0, + "grad_norm": 0.06825767558676081, + "language_loss": 0.81871521, + "learning_rate": 
0.0003860510495213634, + "loss": 0.82984889, + "num_input_tokens_seen": 254152800, + "router_z_loss_mlp": 0.15722656, + "step": 3046, + "time_per_iteration": 2.8188610076904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113296, + "balance_loss_mlp": 1.09753644, + "epoch": 0.5861869949980761, + "flos": 553695740928.0, + "grad_norm": 0.07680372972712284, + "language_loss": 0.7820521, + "learning_rate": 0.0003857477275755746, + "loss": 0.79318506, + "num_input_tokens_seen": 254224384, + "router_z_loss_mlp": 0.1574707, + "step": 3047, + "time_per_iteration": 2.680021047592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114239, + "balance_loss_mlp": 1.09859896, + "epoch": 0.5863793766833397, + "flos": 718667375616.0, + "grad_norm": 0.06132573168351462, + "language_loss": 0.83483028, + "learning_rate": 0.00038544444998645167, + "loss": 0.84597266, + "num_input_tokens_seen": 254310960, + "router_z_loss_mlp": 0.15625, + "step": 3048, + "time_per_iteration": 3.024035692214966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110798, + "balance_loss_mlp": 1.09482431, + "epoch": 0.5865717583686033, + "flos": 472289522688.0, + "grad_norm": 0.07774154556799634, + "language_loss": 0.81755519, + "learning_rate": 0.00038514121687173767, + "loss": 0.82866311, + "num_input_tokens_seen": 254378336, + "router_z_loss_mlp": 0.15966797, + "step": 3049, + "time_per_iteration": 2.602348566055298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106413, + "balance_loss_mlp": 1.09079647, + "epoch": 0.5867641400538669, + "flos": 813482901504.0, + "grad_norm": 0.07288499528915, + "language_loss": 0.81607699, + "learning_rate": 0.00038483802834915807, + "loss": 0.82714111, + "num_input_tokens_seen": 254454352, + "router_z_loss_mlp": 0.15600586, + "step": 3050, + "time_per_iteration": 3.0202012062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102645, + "balance_loss_mlp": 1.08663559, + 
"epoch": 0.5869565217391305, + "flos": 486531380736.0, + "grad_norm": 0.06464020852625685, + "language_loss": 0.78985357, + "learning_rate": 0.00038453488453642074, + "loss": 0.80088001, + "num_input_tokens_seen": 254526352, + "router_z_loss_mlp": 0.16003418, + "step": 3051, + "time_per_iteration": 2.6733691692352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101193, + "balance_loss_mlp": 1.0853616, + "epoch": 0.587148903424394, + "flos": 569385386496.0, + "grad_norm": 0.11499584820010532, + "language_loss": 0.86622018, + "learning_rate": 0.00038423178555121697, + "loss": 0.87723207, + "num_input_tokens_seen": 254598720, + "router_z_loss_mlp": 0.1583252, + "step": 3052, + "time_per_iteration": 2.7339212894439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091842, + "balance_loss_mlp": 1.07583237, + "epoch": 0.5873412851096576, + "flos": 747296824320.0, + "grad_norm": 0.06975664982977658, + "language_loss": 0.85649264, + "learning_rate": 0.00038392873151121994, + "loss": 0.86741114, + "num_input_tokens_seen": 254683664, + "router_z_loss_mlp": 0.16003418, + "step": 3053, + "time_per_iteration": 3.0498745441436768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094641, + "balance_loss_mlp": 1.07848823, + "epoch": 0.5875336667949211, + "flos": 528142477824.0, + "grad_norm": 0.07594371919491524, + "language_loss": 0.82729709, + "learning_rate": 0.0003836257225340859, + "loss": 0.83824348, + "num_input_tokens_seen": 254754688, + "router_z_loss_mlp": 0.16149902, + "step": 3054, + "time_per_iteration": 2.6312718391418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083342, + "balance_loss_mlp": 1.0662595, + "epoch": 0.5877260484801847, + "flos": 824166586368.0, + "grad_norm": 0.07226211151265562, + "language_loss": 0.81785333, + "learning_rate": 0.00038332275873745336, + "loss": 0.82868683, + "num_input_tokens_seen": 254838976, + "router_z_loss_mlp": 0.17102051, + "step": 3055, + 
"time_per_iteration": 3.0953447818756104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086607, + "balance_loss_mlp": 1.06990623, + "epoch": 0.5879184301654482, + "flos": 591598162944.0, + "grad_norm": 0.05891266503615663, + "language_loss": 0.82779503, + "learning_rate": 0.0003830198402389431, + "loss": 0.83866107, + "num_input_tokens_seen": 254912912, + "router_z_loss_mlp": 0.16711426, + "step": 3056, + "time_per_iteration": 2.7385828495025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022532, + "balance_loss_mlp": 1.01485538, + "epoch": 0.5881108118507118, + "flos": 1545805513728.0, + "grad_norm": 0.023195211062617696, + "language_loss": 0.77348936, + "learning_rate": 0.0003827169671561585, + "loss": 0.78371465, + "num_input_tokens_seen": 255151488, + "router_z_loss_mlp": 0.07666016, + "step": 3057, + "time_per_iteration": 5.0122692584991455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082378, + "balance_loss_mlp": 1.06487858, + "epoch": 0.5883031935359754, + "flos": 489597654528.0, + "grad_norm": 0.09420327310468278, + "language_loss": 0.82856947, + "learning_rate": 0.0003824141396066855, + "loss": 0.83939326, + "num_input_tokens_seen": 255218896, + "router_z_loss_mlp": 0.17504883, + "step": 3058, + "time_per_iteration": 2.630829334259033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086117, + "balance_loss_mlp": 1.06941545, + "epoch": 0.588495575221239, + "flos": 582836539392.0, + "grad_norm": 0.07561205741670568, + "language_loss": 0.82764673, + "learning_rate": 0.000382111357708092, + "loss": 0.83850795, + "num_input_tokens_seen": 255287408, + "router_z_loss_mlp": 0.16711426, + "step": 3059, + "time_per_iteration": 2.754732608795166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079118, + "balance_loss_mlp": 1.06203532, + "epoch": 0.5886879569065026, + "flos": 661048003584.0, + "grad_norm": 0.07214212654246877, + "language_loss": 0.83606875, + 
"learning_rate": 0.00038180862157792864, + "loss": 0.84685993, + "num_input_tokens_seen": 255358432, + "router_z_loss_mlp": 0.17102051, + "step": 3060, + "time_per_iteration": 2.8452963829040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079154, + "balance_loss_mlp": 1.06195176, + "epoch": 0.588880338591766, + "flos": 562657425408.0, + "grad_norm": 0.06766423660124334, + "language_loss": 0.81912309, + "learning_rate": 0.0003815059313337279, + "loss": 0.82991457, + "num_input_tokens_seen": 255425744, + "router_z_loss_mlp": 0.17224121, + "step": 3061, + "time_per_iteration": 2.699923515319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075885, + "balance_loss_mlp": 1.05862319, + "epoch": 0.5890727202770296, + "flos": 554730923520.0, + "grad_norm": 0.05609969141419105, + "language_loss": 0.78319967, + "learning_rate": 0.00038120328709300436, + "loss": 0.79395854, + "num_input_tokens_seen": 255505808, + "router_z_loss_mlp": 0.17272949, + "step": 3062, + "time_per_iteration": 2.9140214920043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073381, + "balance_loss_mlp": 1.05580938, + "epoch": 0.5892651019622932, + "flos": 655520781312.0, + "grad_norm": 0.06388746068798092, + "language_loss": 0.83677167, + "learning_rate": 0.0003809006889732549, + "loss": 0.84750545, + "num_input_tokens_seen": 255580160, + "router_z_loss_mlp": 0.17590332, + "step": 3063, + "time_per_iteration": 2.812375068664551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073036, + "balance_loss_mlp": 1.05551219, + "epoch": 0.5894574836475568, + "flos": 453202490880.0, + "grad_norm": 0.1840205152254721, + "language_loss": 0.87883544, + "learning_rate": 0.0003805981370919589, + "loss": 0.88956577, + "num_input_tokens_seen": 255644016, + "router_z_loss_mlp": 0.17529297, + "step": 3064, + "time_per_iteration": 2.5644187927246094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073964, + 
"balance_loss_mlp": 1.05604672, + "epoch": 0.5896498653328203, + "flos": 519032489472.0, + "grad_norm": 0.08741335688742048, + "language_loss": 0.83813435, + "learning_rate": 0.0003802956315665771, + "loss": 0.84887397, + "num_input_tokens_seen": 255718192, + "router_z_loss_mlp": 0.17932129, + "step": 3065, + "time_per_iteration": 2.6985597610473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077561, + "balance_loss_mlp": 1.0604192, + "epoch": 0.5898422470180839, + "flos": 549050628096.0, + "grad_norm": 0.09549414349914971, + "language_loss": 0.81565332, + "learning_rate": 0.0003799931725145529, + "loss": 0.82642901, + "num_input_tokens_seen": 255787696, + "router_z_loss_mlp": 0.17150879, + "step": 3066, + "time_per_iteration": 2.621553897857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079439, + "balance_loss_mlp": 1.06172466, + "epoch": 0.5900346287033474, + "flos": 524312663040.0, + "grad_norm": 0.06470265589627064, + "language_loss": 0.85731423, + "learning_rate": 0.00037969076005331083, + "loss": 0.86810863, + "num_input_tokens_seen": 255862992, + "router_z_loss_mlp": 0.17736816, + "step": 3067, + "time_per_iteration": 2.7705938816070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108525, + "balance_loss_mlp": 1.06776178, + "epoch": 0.590227010388611, + "flos": 567156805632.0, + "grad_norm": 0.07323535980547291, + "language_loss": 0.87987936, + "learning_rate": 0.00037938839430025817, + "loss": 0.89073181, + "num_input_tokens_seen": 255931872, + "router_z_loss_mlp": 0.17504883, + "step": 3068, + "time_per_iteration": 2.6688857078552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085401, + "balance_loss_mlp": 1.06792498, + "epoch": 0.5904193920738746, + "flos": 583333208064.0, + "grad_norm": 0.13096377841439616, + "language_loss": 0.85380679, + "learning_rate": 0.0003790860753727835, + "loss": 0.86466074, + "num_input_tokens_seen": 256004656, + "router_z_loss_mlp": 
0.17492676, + "step": 3069, + "time_per_iteration": 2.9018454551696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091061, + "balance_loss_mlp": 1.07345426, + "epoch": 0.5906117737591381, + "flos": 529701493248.0, + "grad_norm": 0.0726049430242405, + "language_loss": 0.82249814, + "learning_rate": 0.00037878380338825766, + "loss": 0.83340883, + "num_input_tokens_seen": 256076944, + "router_z_loss_mlp": 0.1763916, + "step": 3070, + "time_per_iteration": 2.695953607559204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095711, + "balance_loss_mlp": 1.07847357, + "epoch": 0.5908041554444017, + "flos": 684229151232.0, + "grad_norm": 0.07160608760806797, + "language_loss": 0.81351429, + "learning_rate": 0.00037848157846403287, + "loss": 0.82447141, + "num_input_tokens_seen": 256154768, + "router_z_loss_mlp": 0.17248535, + "step": 3071, + "time_per_iteration": 2.900130271911621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096895, + "balance_loss_mlp": 1.07976437, + "epoch": 0.5909965371296653, + "flos": 550001746944.0, + "grad_norm": 0.08831271669304017, + "language_loss": 0.83602202, + "learning_rate": 0.0003781794007174435, + "loss": 0.846991, + "num_input_tokens_seen": 256230896, + "router_z_loss_mlp": 0.17150879, + "step": 3072, + "time_per_iteration": 2.7315585613250732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052839, + "balance_loss_mlp": 1.0453527, + "epoch": 0.5911889188149289, + "flos": 1492361750016.0, + "grad_norm": 0.018548344346269084, + "language_loss": 0.74074531, + "learning_rate": 0.0003778772702658051, + "loss": 0.75127375, + "num_input_tokens_seen": 256462336, + "router_z_loss_mlp": 0.07470703, + "step": 3073, + "time_per_iteration": 4.843595027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096296, + "balance_loss_mlp": 1.07984531, + "epoch": 0.5913813005001923, + "flos": 487880423424.0, + "grad_norm": 0.06605464812454943, + 
"language_loss": 0.80771315, + "learning_rate": 0.0003775751872264152, + "loss": 0.81867611, + "num_input_tokens_seen": 256539376, + "router_z_loss_mlp": 0.16455078, + "step": 3074, + "time_per_iteration": 2.812434196472168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088175, + "balance_loss_mlp": 1.07113981, + "epoch": 0.5915736821854559, + "flos": 573331198464.0, + "grad_norm": 0.08890011139795934, + "language_loss": 0.86803812, + "learning_rate": 0.0003772731517165527, + "loss": 0.87891984, + "num_input_tokens_seen": 256617728, + "router_z_loss_mlp": 0.17041016, + "step": 3075, + "time_per_iteration": 2.8199949264526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087435, + "balance_loss_mlp": 1.07135379, + "epoch": 0.5917660638707195, + "flos": 789518389248.0, + "grad_norm": 0.06956331546073297, + "language_loss": 0.83378977, + "learning_rate": 0.0003769711638534784, + "loss": 0.8446641, + "num_input_tokens_seen": 256696032, + "router_z_loss_mlp": 0.16064453, + "step": 3076, + "time_per_iteration": 3.021451711654663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097469, + "balance_loss_mlp": 1.08068419, + "epoch": 0.5919584455559831, + "flos": 528740462592.0, + "grad_norm": 0.07608235771804774, + "language_loss": 0.79065943, + "learning_rate": 0.00037666922375443446, + "loss": 0.80163419, + "num_input_tokens_seen": 256767360, + "router_z_loss_mlp": 0.16796875, + "step": 3077, + "time_per_iteration": 2.602043867111206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092109, + "balance_loss_mlp": 1.076123, + "epoch": 0.5921508272412467, + "flos": 560606510592.0, + "grad_norm": 0.09346086613563626, + "language_loss": 0.81744075, + "learning_rate": 0.00037636733153664396, + "loss": 0.82836187, + "num_input_tokens_seen": 256844848, + "router_z_loss_mlp": 0.15979004, + "step": 3078, + "time_per_iteration": 2.8222453594207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01093493, + "balance_loss_mlp": 1.07719743, + "epoch": 0.5923432089265102, + "flos": 563272662528.0, + "grad_norm": 0.1116363853226753, + "language_loss": 0.79912782, + "learning_rate": 0.0003760654873173124, + "loss": 0.81006277, + "num_input_tokens_seen": 256916688, + "router_z_loss_mlp": 0.16296387, + "step": 3079, + "time_per_iteration": 2.6946070194244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085907, + "balance_loss_mlp": 1.06951547, + "epoch": 0.5925355906117737, + "flos": 495740113920.0, + "grad_norm": 0.06915984482876121, + "language_loss": 0.81859291, + "learning_rate": 0.00037576369121362566, + "loss": 0.82945192, + "num_input_tokens_seen": 256985520, + "router_z_loss_mlp": 0.16394043, + "step": 3080, + "time_per_iteration": 2.6502840518951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088263, + "balance_loss_mlp": 1.07191896, + "epoch": 0.5927279722970373, + "flos": 566249730048.0, + "grad_norm": 0.07693331015944839, + "language_loss": 0.8159368, + "learning_rate": 0.0003754619433427516, + "loss": 0.82681942, + "num_input_tokens_seen": 257067552, + "router_z_loss_mlp": 0.16345215, + "step": 3081, + "time_per_iteration": 2.9385058879852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084208, + "balance_loss_mlp": 1.06749439, + "epoch": 0.5929203539823009, + "flos": 666970578432.0, + "grad_norm": 0.07095697248954357, + "language_loss": 0.77517045, + "learning_rate": 0.0003751602438218392, + "loss": 0.78601247, + "num_input_tokens_seen": 257138896, + "router_z_loss_mlp": 0.16723633, + "step": 3082, + "time_per_iteration": 2.8245561122894287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083121, + "balance_loss_mlp": 1.06693244, + "epoch": 0.5931127356675644, + "flos": 555744084480.0, + "grad_norm": 0.1021077750392874, + "language_loss": 0.83509332, + "learning_rate": 0.0003748585927680186, + "loss": 0.8459245, + "num_input_tokens_seen": 257210592, + 
"router_z_loss_mlp": 0.16186523, + "step": 3083, + "time_per_iteration": 2.6818346977233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084171, + "balance_loss_mlp": 1.06721938, + "epoch": 0.593305117352828, + "flos": 535194210816.0, + "grad_norm": 0.06846862154983226, + "language_loss": 0.82637662, + "learning_rate": 0.00037455699029840086, + "loss": 0.83721828, + "num_input_tokens_seen": 257276208, + "router_z_loss_mlp": 0.16967773, + "step": 3084, + "time_per_iteration": 2.6860570907592773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088457, + "balance_loss_mlp": 1.07176781, + "epoch": 0.5934974990380916, + "flos": 593957795328.0, + "grad_norm": 0.06710726384898401, + "language_loss": 0.8462739, + "learning_rate": 0.0003742554365300787, + "loss": 0.85715848, + "num_input_tokens_seen": 257351920, + "router_z_loss_mlp": 0.16699219, + "step": 3085, + "time_per_iteration": 2.749816656112671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088228, + "balance_loss_mlp": 1.07143116, + "epoch": 0.5936898807233552, + "flos": 712673220096.0, + "grad_norm": 0.08250802724924795, + "language_loss": 0.78595787, + "learning_rate": 0.0003739539315801255, + "loss": 0.79684019, + "num_input_tokens_seen": 257430016, + "router_z_loss_mlp": 0.16809082, + "step": 3086, + "time_per_iteration": 2.982919216156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092705, + "balance_loss_mlp": 1.07571757, + "epoch": 0.5938822624086187, + "flos": 391896465408.0, + "grad_norm": 0.083760246794696, + "language_loss": 0.91647482, + "learning_rate": 0.000373652475565596, + "loss": 0.9274019, + "num_input_tokens_seen": 257492224, + "router_z_loss_mlp": 0.16992188, + "step": 3087, + "time_per_iteration": 2.4816558361053467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102413, + "balance_loss_mlp": 1.08528244, + "epoch": 0.5940746440938822, + "flos": 480285033984.0, + "grad_norm": 
0.09245346089356003, + "language_loss": 0.81352496, + "learning_rate": 0.00037335106860352587, + "loss": 0.82454908, + "num_input_tokens_seen": 257567824, + "router_z_loss_mlp": 0.17138672, + "step": 3088, + "time_per_iteration": 2.675565719604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095107, + "balance_loss_mlp": 1.07863212, + "epoch": 0.5942670257791458, + "flos": 483336626688.0, + "grad_norm": 0.10172018328041595, + "language_loss": 0.83090484, + "learning_rate": 0.00037304971081093146, + "loss": 0.84185594, + "num_input_tokens_seen": 257635488, + "router_z_loss_mlp": 0.16479492, + "step": 3089, + "time_per_iteration": 2.614063024520874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102174, + "balance_loss_mlp": 1.08573484, + "epoch": 0.5944594074644094, + "flos": 547936151040.0, + "grad_norm": 0.09417550180705583, + "language_loss": 0.81048489, + "learning_rate": 0.00037274840230481024, + "loss": 0.82150662, + "num_input_tokens_seen": 257709552, + "router_z_loss_mlp": 0.16442871, + "step": 3090, + "time_per_iteration": 2.791287899017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106483, + "balance_loss_mlp": 1.09013939, + "epoch": 0.594651789149673, + "flos": 449179955712.0, + "grad_norm": 0.08210045649904979, + "language_loss": 0.79059577, + "learning_rate": 0.00037244714320214077, + "loss": 0.80166066, + "num_input_tokens_seen": 257775520, + "router_z_loss_mlp": 0.16345215, + "step": 3091, + "time_per_iteration": 2.5437703132629395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101492, + "balance_loss_mlp": 1.08511281, + "epoch": 0.5948441708349365, + "flos": 596267868672.0, + "grad_norm": 0.06960715408232113, + "language_loss": 0.83210528, + "learning_rate": 0.000372145933619882, + "loss": 0.84312022, + "num_input_tokens_seen": 257858560, + "router_z_loss_mlp": 0.16381836, + "step": 3092, + "time_per_iteration": 2.902186155319214 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01112879, + "balance_loss_mlp": 1.0964278, + "epoch": 0.5950365525202, + "flos": 548516883456.0, + "grad_norm": 0.11673775861228046, + "language_loss": 0.82268316, + "learning_rate": 0.000371844773674974, + "loss": 0.833812, + "num_input_tokens_seen": 257928048, + "router_z_loss_mlp": 0.16455078, + "step": 3093, + "time_per_iteration": 2.6614809036254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116363, + "balance_loss_mlp": 1.10023379, + "epoch": 0.5952289342054636, + "flos": 654700340736.0, + "grad_norm": 0.0944691086002383, + "language_loss": 0.81785637, + "learning_rate": 0.0003715436634843375, + "loss": 0.82902002, + "num_input_tokens_seen": 258003088, + "router_z_loss_mlp": 0.16125488, + "step": 3094, + "time_per_iteration": 2.90022873878479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117363, + "balance_loss_mlp": 1.10172296, + "epoch": 0.5954213158907272, + "flos": 603364018176.0, + "grad_norm": 0.057224396595454204, + "language_loss": 0.80872512, + "learning_rate": 0.00037124260316487355, + "loss": 0.81989878, + "num_input_tokens_seen": 258084880, + "router_z_loss_mlp": 0.15625, + "step": 3095, + "time_per_iteration": 2.885049819946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114273, + "balance_loss_mlp": 1.09841847, + "epoch": 0.5956136975759908, + "flos": 486331319808.0, + "grad_norm": 0.06086987109203959, + "language_loss": 0.89374322, + "learning_rate": 0.0003709415928334643, + "loss": 0.90488601, + "num_input_tokens_seen": 258152032, + "router_z_loss_mlp": 0.15844727, + "step": 3096, + "time_per_iteration": 2.6082546710968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011177, + "balance_loss_mlp": 1.10172629, + "epoch": 0.5958060792612543, + "flos": 658777204224.0, + "grad_norm": 0.09348672972793858, + "language_loss": 0.80559552, + "learning_rate": 0.00037064063260697233, + "loss": 0.81677252, + "num_input_tokens_seen": 
258228896, + "router_z_loss_mlp": 0.15966797, + "step": 3097, + "time_per_iteration": 2.8901162147521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123233, + "balance_loss_mlp": 1.10749698, + "epoch": 0.5959984609465179, + "flos": 723559537152.0, + "grad_norm": 0.06876216438303968, + "language_loss": 0.78693187, + "learning_rate": 0.0003703397226022407, + "loss": 0.79816419, + "num_input_tokens_seen": 258311152, + "router_z_loss_mlp": 0.15722656, + "step": 3098, + "time_per_iteration": 3.066073179244995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102465, + "balance_loss_mlp": 1.09416783, + "epoch": 0.5961908426317815, + "flos": 1519849557504.0, + "grad_norm": 0.03442912107402327, + "language_loss": 0.75499874, + "learning_rate": 0.00037003886293609335, + "loss": 0.7660234, + "num_input_tokens_seen": 258540656, + "router_z_loss_mlp": 0.08300781, + "step": 3099, + "time_per_iteration": 4.9653050899505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127115, + "balance_loss_mlp": 1.11136746, + "epoch": 0.596383224317045, + "flos": 532614693888.0, + "grad_norm": 0.0680420214228425, + "language_loss": 0.8297379, + "learning_rate": 0.0003697380537253339, + "loss": 0.84100908, + "num_input_tokens_seen": 258608960, + "router_z_loss_mlp": 0.15734863, + "step": 3100, + "time_per_iteration": 2.715177059173584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113476, + "balance_loss_mlp": 1.0978117, + "epoch": 0.5965756060023086, + "flos": 591210150912.0, + "grad_norm": 0.06669871573577384, + "language_loss": 0.81245238, + "learning_rate": 0.0003694372950867471, + "loss": 0.82358712, + "num_input_tokens_seen": 258684304, + "router_z_loss_mlp": 0.15649414, + "step": 3101, + "time_per_iteration": 2.8005011081695557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123061, + "balance_loss_mlp": 1.10731363, + "epoch": 0.5967679876875721, + "flos": 862054327296.0, + "grad_norm": 
0.07790109934746459, + "language_loss": 0.77269602, + "learning_rate": 0.0003691365871370976, + "loss": 0.78392667, + "num_input_tokens_seen": 258769472, + "router_z_loss_mlp": 0.15734863, + "step": 3102, + "time_per_iteration": 3.077610731124878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118239, + "balance_loss_mlp": 1.10267067, + "epoch": 0.5969603693728357, + "flos": 553834132992.0, + "grad_norm": 0.06403529919974375, + "language_loss": 0.85239542, + "learning_rate": 0.00036883592999313093, + "loss": 0.86357784, + "num_input_tokens_seen": 258841696, + "router_z_loss_mlp": 0.15551758, + "step": 3103, + "time_per_iteration": 2.6910035610198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123035, + "balance_loss_mlp": 1.10726357, + "epoch": 0.5971527510580993, + "flos": 718662606336.0, + "grad_norm": 0.07439514059918453, + "language_loss": 0.7913959, + "learning_rate": 0.0003685353237715722, + "loss": 0.80262625, + "num_input_tokens_seen": 258915616, + "router_z_loss_mlp": 0.15759277, + "step": 3104, + "time_per_iteration": 2.8957912921905518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118284, + "balance_loss_mlp": 1.10222602, + "epoch": 0.5973451327433629, + "flos": 647631355392.0, + "grad_norm": 0.09765250688336868, + "language_loss": 0.81377506, + "learning_rate": 0.0003682347685891274, + "loss": 0.82495785, + "num_input_tokens_seen": 258994080, + "router_z_loss_mlp": 0.16052246, + "step": 3105, + "time_per_iteration": 2.84584379196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106359, + "balance_loss_mlp": 1.09007454, + "epoch": 0.5975375144286263, + "flos": 721716397056.0, + "grad_norm": 0.07268165375697674, + "language_loss": 0.805511, + "learning_rate": 0.0003679342645624822, + "loss": 0.81657457, + "num_input_tokens_seen": 259075968, + "router_z_loss_mlp": 0.1628418, + "step": 3106, + "time_per_iteration": 3.0009236335754395 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01116176, + "balance_loss_mlp": 1.09978509, + "epoch": 0.5977298961138899, + "flos": 750961082880.0, + "grad_norm": 0.08276382082752762, + "language_loss": 0.81614435, + "learning_rate": 0.0003676338118083025, + "loss": 0.82730609, + "num_input_tokens_seen": 259162512, + "router_z_loss_mlp": 0.16394043, + "step": 3107, + "time_per_iteration": 3.088297128677368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103194, + "balance_loss_mlp": 1.08736336, + "epoch": 0.5979222777991535, + "flos": 530961702912.0, + "grad_norm": 0.10722680659176895, + "language_loss": 0.79196644, + "learning_rate": 0.0003673334104432347, + "loss": 0.80299842, + "num_input_tokens_seen": 259228752, + "router_z_loss_mlp": 0.15820312, + "step": 3108, + "time_per_iteration": 2.634643077850342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100064, + "balance_loss_mlp": 1.08379245, + "epoch": 0.5981146594844171, + "flos": 621749551104.0, + "grad_norm": 0.07294397192010518, + "language_loss": 0.8350544, + "learning_rate": 0.0003670330605839048, + "loss": 0.84605503, + "num_input_tokens_seen": 259303440, + "router_z_loss_mlp": 0.16271973, + "step": 3109, + "time_per_iteration": 2.8294010162353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091507, + "balance_loss_mlp": 1.0755446, + "epoch": 0.5983070411696807, + "flos": 603589045248.0, + "grad_norm": 0.08059004302640393, + "language_loss": 0.76664943, + "learning_rate": 0.0003667327623469191, + "loss": 0.77756447, + "num_input_tokens_seen": 259378752, + "router_z_loss_mlp": 0.1595459, + "step": 3110, + "time_per_iteration": 2.784902334213257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100362, + "balance_loss_mlp": 1.084126, + "epoch": 0.5984994228549442, + "flos": 633483472896.0, + "grad_norm": 0.07319281645054936, + "language_loss": 0.77725756, + "learning_rate": 0.00036643251584886333, + "loss": 0.78826118, + "num_input_tokens_seen": 
259454336, + "router_z_loss_mlp": 0.16235352, + "step": 3111, + "time_per_iteration": 2.795421838760376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100904, + "balance_loss_mlp": 1.08444118, + "epoch": 0.5986918045402078, + "flos": 525278836224.0, + "grad_norm": 0.07234799336755846, + "language_loss": 0.8192088, + "learning_rate": 0.00036613232120630393, + "loss": 0.83021784, + "num_input_tokens_seen": 259518960, + "router_z_loss_mlp": 0.16467285, + "step": 3112, + "time_per_iteration": 2.6119191646575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095181, + "balance_loss_mlp": 1.07855165, + "epoch": 0.5988841862254713, + "flos": 483180982272.0, + "grad_norm": 0.1220679262263155, + "language_loss": 0.7997117, + "learning_rate": 0.00036583217853578643, + "loss": 0.81066352, + "num_input_tokens_seen": 259584352, + "router_z_loss_mlp": 0.16638184, + "step": 3113, + "time_per_iteration": 2.5559191703796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095031, + "balance_loss_mlp": 1.07856846, + "epoch": 0.5990765679107349, + "flos": 1140149924352.0, + "grad_norm": 0.06954821000435275, + "language_loss": 0.77413309, + "learning_rate": 0.000365532087953837, + "loss": 0.78508341, + "num_input_tokens_seen": 259693152, + "router_z_loss_mlp": 0.16467285, + "step": 3114, + "time_per_iteration": 3.6444194316864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093034, + "balance_loss_mlp": 1.07666647, + "epoch": 0.5992689495959984, + "flos": 516986717184.0, + "grad_norm": 0.07355388338928669, + "language_loss": 0.89153886, + "learning_rate": 0.00036523204957696065, + "loss": 0.90246928, + "num_input_tokens_seen": 259762048, + "router_z_loss_mlp": 0.16369629, + "step": 3115, + "time_per_iteration": 2.6114542484283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090631, + "balance_loss_mlp": 1.07385826, + "epoch": 0.599461331281262, + "flos": 744618562560.0, + "grad_norm": 
0.06661163617003031, + "language_loss": 0.80990088, + "learning_rate": 0.00036493206352164324, + "loss": 0.82080722, + "num_input_tokens_seen": 259843184, + "router_z_loss_mlp": 0.16784668, + "step": 3116, + "time_per_iteration": 2.977773666381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099589, + "balance_loss_mlp": 1.08299482, + "epoch": 0.5996537129665256, + "flos": 592359132672.0, + "grad_norm": 0.06605770678363264, + "language_loss": 0.85320091, + "learning_rate": 0.000364632129904349, + "loss": 0.86419678, + "num_input_tokens_seen": 259912720, + "router_z_loss_mlp": 0.16601562, + "step": 3117, + "time_per_iteration": 2.7504782676696777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109875, + "balance_loss_mlp": 1.08246565, + "epoch": 0.5998460946517892, + "flos": 559010419200.0, + "grad_norm": 0.07896925435607946, + "language_loss": 0.78125691, + "learning_rate": 0.00036433224884152283, + "loss": 0.79224437, + "num_input_tokens_seen": 259985472, + "router_z_loss_mlp": 0.1628418, + "step": 3118, + "time_per_iteration": 2.762640953063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106506, + "balance_loss_mlp": 1.09019828, + "epoch": 0.6000384763370528, + "flos": 484567100928.0, + "grad_norm": 0.08654027448722386, + "language_loss": 0.77639025, + "learning_rate": 0.00036403242044958875, + "loss": 0.78745532, + "num_input_tokens_seen": 260050336, + "router_z_loss_mlp": 0.16308594, + "step": 3119, + "time_per_iteration": 2.590341567993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105329, + "balance_loss_mlp": 1.08873463, + "epoch": 0.6002308580223162, + "flos": 596767108608.0, + "grad_norm": 0.12490963722323402, + "language_loss": 0.91469646, + "learning_rate": 0.0003637326448449507, + "loss": 0.92574978, + "num_input_tokens_seen": 260120304, + "router_z_loss_mlp": 0.16601562, + "step": 3120, + "time_per_iteration": 2.757040500640869 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01114298, + "balance_loss_mlp": 1.09782338, + "epoch": 0.6004232397075798, + "flos": 545146661376.0, + "grad_norm": 0.07048281834234121, + "language_loss": 0.85906887, + "learning_rate": 0.00036343292214399177, + "loss": 0.87021184, + "num_input_tokens_seen": 260198304, + "router_z_loss_mlp": 0.16479492, + "step": 3121, + "time_per_iteration": 2.7731616497039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110866, + "balance_loss_mlp": 1.09368825, + "epoch": 0.6006156213928434, + "flos": 629947694592.0, + "grad_norm": 0.08856935015061373, + "language_loss": 0.77217454, + "learning_rate": 0.00036313325246307456, + "loss": 0.78328323, + "num_input_tokens_seen": 260277664, + "router_z_loss_mlp": 0.171875, + "step": 3122, + "time_per_iteration": 2.8254263401031494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107144, + "balance_loss_mlp": 1.0897516, + "epoch": 0.600808003078107, + "flos": 582315277824.0, + "grad_norm": 0.07082824318872671, + "language_loss": 0.87116647, + "learning_rate": 0.0003628336359185411, + "loss": 0.88223791, + "num_input_tokens_seen": 260350096, + "router_z_loss_mlp": 0.17419434, + "step": 3123, + "time_per_iteration": 2.6960785388946533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104705, + "balance_loss_mlp": 1.08815873, + "epoch": 0.6010003847633705, + "flos": 635274855936.0, + "grad_norm": 0.09352377906746982, + "language_loss": 0.75570095, + "learning_rate": 0.000362534072626713, + "loss": 0.76674795, + "num_input_tokens_seen": 260421888, + "router_z_loss_mlp": 0.16552734, + "step": 3124, + "time_per_iteration": 2.7963545322418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094202, + "balance_loss_mlp": 1.07738113, + "epoch": 0.6011927664486341, + "flos": 718763922432.0, + "grad_norm": 0.08561674190647896, + "language_loss": 0.81475127, + "learning_rate": 0.00036223456270389093, + "loss": 0.82569331, + "num_input_tokens_seen": 
260499616, + "router_z_loss_mlp": 0.16833496, + "step": 3125, + "time_per_iteration": 2.992478609085083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085838, + "balance_loss_mlp": 1.06857657, + "epoch": 0.6013851481338977, + "flos": 499036184064.0, + "grad_norm": 0.08087477259987003, + "language_loss": 0.80765188, + "learning_rate": 0.00036193510626635517, + "loss": 0.81851029, + "num_input_tokens_seen": 260572048, + "router_z_loss_mlp": 0.17272949, + "step": 3126, + "time_per_iteration": 2.6718900203704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077369, + "balance_loss_mlp": 1.05972588, + "epoch": 0.6015775298191612, + "flos": 749587447296.0, + "grad_norm": 0.08853778728712877, + "language_loss": 0.81355464, + "learning_rate": 0.0003616357034303649, + "loss": 0.82432842, + "num_input_tokens_seen": 260644720, + "router_z_loss_mlp": 0.17663574, + "step": 3127, + "time_per_iteration": 2.9547274112701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075422, + "balance_loss_mlp": 1.05762434, + "epoch": 0.6017699115044248, + "flos": 593063202816.0, + "grad_norm": 0.1711605115844366, + "language_loss": 0.78441834, + "learning_rate": 0.0003613363543121584, + "loss": 0.79517257, + "num_input_tokens_seen": 260724864, + "router_z_loss_mlp": 0.17810059, + "step": 3128, + "time_per_iteration": 2.886970281600952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065387, + "balance_loss_mlp": 1.04813766, + "epoch": 0.6019622931896883, + "flos": 515111270400.0, + "grad_norm": 0.08758734410380958, + "language_loss": 0.85043442, + "learning_rate": 0.00036103705902795357, + "loss": 0.86108834, + "num_input_tokens_seen": 260800896, + "router_z_loss_mlp": 0.17260742, + "step": 3129, + "time_per_iteration": 2.748079776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072993, + "balance_loss_mlp": 1.0555644, + "epoch": 0.6021546748749519, + "flos": 490469852160.0, + "grad_norm": 
0.09694707916442274, + "language_loss": 0.7971251, + "learning_rate": 0.0003607378176939471, + "loss": 0.80785501, + "num_input_tokens_seen": 260872736, + "router_z_loss_mlp": 0.17443848, + "step": 3130, + "time_per_iteration": 2.6402640342712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069092, + "balance_loss_mlp": 1.05256987, + "epoch": 0.6023470565602155, + "flos": 541032721920.0, + "grad_norm": 0.08416157217627585, + "language_loss": 0.82138842, + "learning_rate": 0.00036043863042631465, + "loss": 0.83207935, + "num_input_tokens_seen": 260943264, + "router_z_loss_mlp": 0.1652832, + "step": 3131, + "time_per_iteration": 2.679304838180542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069317, + "balance_loss_mlp": 1.05229378, + "epoch": 0.6025394382454791, + "flos": 845020408320.0, + "grad_norm": 0.08544531393878185, + "language_loss": 0.76554382, + "learning_rate": 0.00036013949734121133, + "loss": 0.77623701, + "num_input_tokens_seen": 261030064, + "router_z_loss_mlp": 0.17028809, + "step": 3132, + "time_per_iteration": 3.1334645748138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071138, + "balance_loss_mlp": 1.05466342, + "epoch": 0.6027318199307425, + "flos": 577173496320.0, + "grad_norm": 0.08104461370045753, + "language_loss": 0.82059807, + "learning_rate": 0.00035984041855477043, + "loss": 0.8313095, + "num_input_tokens_seen": 261106496, + "router_z_loss_mlp": 0.16467285, + "step": 3133, + "time_per_iteration": 2.7347941398620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01045627, + "balance_loss_mlp": 1.03842688, + "epoch": 0.6029242016160061, + "flos": 1470976754688.0, + "grad_norm": 0.025003389778794672, + "language_loss": 0.78709894, + "learning_rate": 0.00035954139418310495, + "loss": 0.79755521, + "num_input_tokens_seen": 261343248, + "router_z_loss_mlp": 0.07177734, + "step": 3134, + "time_per_iteration": 4.970470428466797 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01076232, + "balance_loss_mlp": 1.05934048, + "epoch": 0.6031165833012697, + "flos": 480744626688.0, + "grad_norm": 0.07365504722099776, + "language_loss": 0.79866755, + "learning_rate": 0.00035924242434230637, + "loss": 0.80942982, + "num_input_tokens_seen": 261416704, + "router_z_loss_mlp": 0.16906738, + "step": 3135, + "time_per_iteration": 2.7135050296783447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107956, + "balance_loss_mlp": 1.06296587, + "epoch": 0.6033089649865333, + "flos": 499468612608.0, + "grad_norm": 0.08294049229736823, + "language_loss": 0.78440452, + "learning_rate": 0.00035894350914844516, + "loss": 0.79520017, + "num_input_tokens_seen": 261486688, + "router_z_loss_mlp": 0.16601562, + "step": 3136, + "time_per_iteration": 2.6597416400909424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079884, + "balance_loss_mlp": 1.06325424, + "epoch": 0.6035013466717969, + "flos": 556613710848.0, + "grad_norm": 0.08267470686196479, + "language_loss": 0.83196414, + "learning_rate": 0.0003586446487175703, + "loss": 0.84276295, + "num_input_tokens_seen": 261557344, + "router_z_loss_mlp": 0.16638184, + "step": 3137, + "time_per_iteration": 2.7022488117218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084641, + "balance_loss_mlp": 1.06798732, + "epoch": 0.6036937283570604, + "flos": 594827421696.0, + "grad_norm": 0.064575038850489, + "language_loss": 0.85214019, + "learning_rate": 0.0003583458431657099, + "loss": 0.86298662, + "num_input_tokens_seen": 261626240, + "router_z_loss_mlp": 0.16662598, + "step": 3138, + "time_per_iteration": 2.7720208168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084009, + "balance_loss_mlp": 1.06771302, + "epoch": 0.603886110042324, + "flos": 540958569984.0, + "grad_norm": 0.09877124262847642, + "language_loss": 0.82838678, + "learning_rate": 0.00035804709260887056, + "loss": 0.83922684, + 
"num_input_tokens_seen": 261696368, + "router_z_loss_mlp": 0.16296387, + "step": 3139, + "time_per_iteration": 2.6879312992095947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086877, + "balance_loss_mlp": 1.07065237, + "epoch": 0.6040784917275875, + "flos": 518582808576.0, + "grad_norm": 0.07215366111763855, + "language_loss": 0.8912158, + "learning_rate": 0.0003577483971630373, + "loss": 0.90208459, + "num_input_tokens_seen": 261769104, + "router_z_loss_mlp": 0.16223145, + "step": 3140, + "time_per_iteration": 2.734809398651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085015, + "balance_loss_mlp": 1.06892204, + "epoch": 0.6042708734128511, + "flos": 660751395840.0, + "grad_norm": 0.05656780869347305, + "language_loss": 0.84707594, + "learning_rate": 0.00035744975694417414, + "loss": 0.85792601, + "num_input_tokens_seen": 261844880, + "router_z_loss_mlp": 0.16088867, + "step": 3141, + "time_per_iteration": 2.8830533027648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083849, + "balance_loss_mlp": 1.06837583, + "epoch": 0.6044632550981146, + "flos": 572330520576.0, + "grad_norm": 0.12103965495464937, + "language_loss": 0.82471883, + "learning_rate": 0.00035715117206822344, + "loss": 0.83555734, + "num_input_tokens_seen": 261923280, + "router_z_loss_mlp": 0.15454102, + "step": 3142, + "time_per_iteration": 2.838871479034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085944, + "balance_loss_mlp": 1.06989884, + "epoch": 0.6046556367833782, + "flos": 546681083904.0, + "grad_norm": 0.07532409559899438, + "language_loss": 0.80957747, + "learning_rate": 0.0003568526426511065, + "loss": 0.82043689, + "num_input_tokens_seen": 261990832, + "router_z_loss_mlp": 0.16040039, + "step": 3143, + "time_per_iteration": 2.646676540374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088746, + "balance_loss_mlp": 1.07312953, + "epoch": 0.6048480184686418, + "flos": 
776838117888.0, + "grad_norm": 0.09699368707048923, + "language_loss": 0.82747424, + "learning_rate": 0.000356554168808722, + "loss": 0.83836174, + "num_input_tokens_seen": 262063760, + "router_z_loss_mlp": 0.15612793, + "step": 3144, + "time_per_iteration": 2.9851598739624023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093552, + "balance_loss_mlp": 1.07773244, + "epoch": 0.6050404001539054, + "flos": 657144036864.0, + "grad_norm": 0.07251607714921615, + "language_loss": 0.84944451, + "learning_rate": 0.00035625575065694837, + "loss": 0.86037999, + "num_input_tokens_seen": 262137968, + "router_z_loss_mlp": 0.15808105, + "step": 3145, + "time_per_iteration": 2.8598599433898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090889, + "balance_loss_mlp": 1.07443857, + "epoch": 0.605232781839169, + "flos": 548983816704.0, + "grad_norm": 0.07064458078135354, + "language_loss": 0.77895433, + "learning_rate": 0.0003559573883116415, + "loss": 0.78986323, + "num_input_tokens_seen": 262211264, + "router_z_loss_mlp": 0.16455078, + "step": 3146, + "time_per_iteration": 2.733262062072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089343, + "balance_loss_mlp": 1.07359576, + "epoch": 0.6054251635244324, + "flos": 605402449920.0, + "grad_norm": 0.07444440196123078, + "language_loss": 0.85480058, + "learning_rate": 0.00035565908188863604, + "loss": 0.86569399, + "num_input_tokens_seen": 262289648, + "router_z_loss_mlp": 0.15734863, + "step": 3147, + "time_per_iteration": 2.853851079940796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091792, + "balance_loss_mlp": 1.07599723, + "epoch": 0.605617545209696, + "flos": 613679887872.0, + "grad_norm": 0.06196180807513896, + "language_loss": 0.79582435, + "learning_rate": 0.00035536083150374464, + "loss": 0.80674225, + "num_input_tokens_seen": 262362704, + "router_z_loss_mlp": 0.15783691, + "step": 3148, + "time_per_iteration": 2.776559352874756 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_mlp": 1.03995907, + "epoch": 0.6058099268949596, + "flos": 1498301577216.0, + "grad_norm": 0.024337037001299088, + "language_loss": 0.74747956, + "learning_rate": 0.00035506263727275893, + "loss": 0.75795162, + "num_input_tokens_seen": 262596864, + "router_z_loss_mlp": 0.07226562, + "step": 3149, + "time_per_iteration": 4.840685129165649 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091535, + "balance_loss_mlp": 1.07552564, + "epoch": 0.6060023085802232, + "flos": 670476621312.0, + "grad_norm": 0.06209204496769419, + "language_loss": 0.85722816, + "learning_rate": 0.0003547644993114475, + "loss": 0.8681435, + "num_input_tokens_seen": 262671088, + "router_z_loss_mlp": 0.16003418, + "step": 3150, + "time_per_iteration": 2.8153529167175293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092959, + "balance_loss_mlp": 1.07712793, + "epoch": 0.6061946902654868, + "flos": 606168562176.0, + "grad_norm": 0.07176933512118068, + "language_loss": 0.79877794, + "learning_rate": 0.00035446641773555806, + "loss": 0.80970764, + "num_input_tokens_seen": 262743888, + "router_z_loss_mlp": 0.15820312, + "step": 3151, + "time_per_iteration": 2.757474184036255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094094, + "balance_loss_mlp": 1.0779295, + "epoch": 0.6063870719507503, + "flos": 557844185088.0, + "grad_norm": 0.10666232173403664, + "language_loss": 0.86817247, + "learning_rate": 0.000354168392660816, + "loss": 0.87911344, + "num_input_tokens_seen": 262819616, + "router_z_loss_mlp": 0.16162109, + "step": 3152, + "time_per_iteration": 2.7577521800994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093009, + "balance_loss_mlp": 1.07742882, + "epoch": 0.6065794536360138, + "flos": 557154796032.0, + "grad_norm": 0.06835832262029293, + "language_loss": 0.82626665, + "learning_rate": 0.0003538704242029252, + "loss": 
0.83719671, + "num_input_tokens_seen": 262893984, + "router_z_loss_mlp": 0.15576172, + "step": 3153, + "time_per_iteration": 2.7824299335479736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096364, + "balance_loss_mlp": 1.08066463, + "epoch": 0.6067718353212774, + "flos": 690144385536.0, + "grad_norm": 0.07699381631687732, + "language_loss": 0.77828813, + "learning_rate": 0.0003535725124775672, + "loss": 0.7892518, + "num_input_tokens_seen": 262969648, + "router_z_loss_mlp": 0.15686035, + "step": 3154, + "time_per_iteration": 2.8603780269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101879, + "balance_loss_mlp": 1.085392, + "epoch": 0.606964217006541, + "flos": 521804726784.0, + "grad_norm": 0.06603606941894191, + "language_loss": 0.86388272, + "learning_rate": 0.00035327465760040126, + "loss": 0.87490153, + "num_input_tokens_seen": 263042048, + "router_z_loss_mlp": 0.16491699, + "step": 3155, + "time_per_iteration": 2.731767177581787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102466, + "balance_loss_mlp": 1.08700442, + "epoch": 0.6071565986918045, + "flos": 641555707392.0, + "grad_norm": 0.08742295718167487, + "language_loss": 0.84376252, + "learning_rate": 0.00035297685968706526, + "loss": 0.85478723, + "num_input_tokens_seen": 263108032, + "router_z_loss_mlp": 0.15441895, + "step": 3156, + "time_per_iteration": 2.7879996299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099222, + "balance_loss_mlp": 1.08361709, + "epoch": 0.6073489803770681, + "flos": 560581917696.0, + "grad_norm": 0.07206801524938761, + "language_loss": 0.82717532, + "learning_rate": 0.00035267911885317454, + "loss": 0.83816749, + "num_input_tokens_seen": 263175184, + "router_z_loss_mlp": 0.15588379, + "step": 3157, + "time_per_iteration": 2.6752853393554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096892, + "balance_loss_mlp": 1.08108473, + "epoch": 0.6075413620623317, + 
"flos": 586088193024.0, + "grad_norm": 0.06913859395071588, + "language_loss": 0.81624317, + "learning_rate": 0.0003523814352143222, + "loss": 0.8272121, + "num_input_tokens_seen": 263252768, + "router_z_loss_mlp": 0.15795898, + "step": 3158, + "time_per_iteration": 2.851680040359497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096859, + "balance_loss_mlp": 1.08079004, + "epoch": 0.6077337437475953, + "flos": 630812551680.0, + "grad_norm": 0.07191756501085539, + "language_loss": 0.90879536, + "learning_rate": 0.00035208380888607937, + "loss": 0.91976392, + "num_input_tokens_seen": 263328720, + "router_z_loss_mlp": 0.16064453, + "step": 3159, + "time_per_iteration": 2.8229289054870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030172, + "balance_loss_mlp": 1.02311516, + "epoch": 0.6079261254328588, + "flos": 1468503696384.0, + "grad_norm": 0.017458667771122316, + "language_loss": 0.79461986, + "learning_rate": 0.000351786239983995, + "loss": 0.80492157, + "num_input_tokens_seen": 263554656, + "router_z_loss_mlp": 0.07080078, + "step": 3160, + "time_per_iteration": 4.860463619232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01026979, + "balance_loss_mlp": 1.01992178, + "epoch": 0.6081185071181223, + "flos": 1523024861184.0, + "grad_norm": 0.015423076795417967, + "language_loss": 0.7569223, + "learning_rate": 0.00035148872862359517, + "loss": 0.76719207, + "num_input_tokens_seen": 263791600, + "router_z_loss_mlp": 0.07080078, + "step": 3161, + "time_per_iteration": 5.027961254119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090745, + "balance_loss_mlp": 1.07459164, + "epoch": 0.6083108888033859, + "flos": 556319674368.0, + "grad_norm": 0.06716496050507109, + "language_loss": 0.81388539, + "learning_rate": 0.00035119127492038446, + "loss": 0.82479286, + "num_input_tokens_seen": 263869744, + "router_z_loss_mlp": 0.16149902, + "step": 3162, + "time_per_iteration": 
2.8567075729370117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090321, + "balance_loss_mlp": 1.07425177, + "epoch": 0.6085032704886495, + "flos": 841166000640.0, + "grad_norm": 0.07519938175586753, + "language_loss": 0.82571161, + "learning_rate": 0.00035089387898984436, + "loss": 0.83661485, + "num_input_tokens_seen": 263946624, + "router_z_loss_mlp": 0.16064453, + "step": 3163, + "time_per_iteration": 3.0894179344177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093705, + "balance_loss_mlp": 1.07734919, + "epoch": 0.6086956521739131, + "flos": 684792631296.0, + "grad_norm": 0.07531226352360243, + "language_loss": 0.81800103, + "learning_rate": 0.0003505965409474343, + "loss": 0.82893807, + "num_input_tokens_seen": 264022064, + "router_z_loss_mlp": 0.16357422, + "step": 3164, + "time_per_iteration": 2.909203290939331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088443, + "balance_loss_mlp": 1.07221854, + "epoch": 0.6088880338591766, + "flos": 535799536128.0, + "grad_norm": 0.06350426788679164, + "language_loss": 0.86488736, + "learning_rate": 0.0003502992609085913, + "loss": 0.87577182, + "num_input_tokens_seen": 264089520, + "router_z_loss_mlp": 0.16223145, + "step": 3165, + "time_per_iteration": 2.6909096240997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087782, + "balance_loss_mlp": 1.07146227, + "epoch": 0.6090804155444401, + "flos": 731533026816.0, + "grad_norm": 0.0979130476844587, + "language_loss": 0.82205462, + "learning_rate": 0.00035000203898872954, + "loss": 0.83293247, + "num_input_tokens_seen": 264173056, + "router_z_loss_mlp": 0.16320801, + "step": 3166, + "time_per_iteration": 3.0287840366363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092824, + "balance_loss_mlp": 1.07664716, + "epoch": 0.6092727972297037, + "flos": 699014665728.0, + "grad_norm": 0.10375532619284132, + "language_loss": 0.84533244, + "learning_rate": 
0.0003497048753032406, + "loss": 0.85626066, + "num_input_tokens_seen": 264250912, + "router_z_loss_mlp": 0.16174316, + "step": 3167, + "time_per_iteration": 2.8883583545684814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092408, + "balance_loss_mlp": 1.07648182, + "epoch": 0.6094651789149673, + "flos": 1051946735616.0, + "grad_norm": 0.06471277204040406, + "language_loss": 0.80592054, + "learning_rate": 0.000349407769967494, + "loss": 0.81684464, + "num_input_tokens_seen": 264342800, + "router_z_loss_mlp": 0.15917969, + "step": 3168, + "time_per_iteration": 3.386155605316162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099872, + "balance_loss_mlp": 1.08381498, + "epoch": 0.6096575606002309, + "flos": 503085883392.0, + "grad_norm": 0.11400005862882004, + "language_loss": 0.84987879, + "learning_rate": 0.0003491107230968361, + "loss": 0.86087751, + "num_input_tokens_seen": 264413664, + "router_z_loss_mlp": 0.16052246, + "step": 3169, + "time_per_iteration": 2.6899755001068115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096804, + "balance_loss_mlp": 1.08061576, + "epoch": 0.6098499422854944, + "flos": 585643281408.0, + "grad_norm": 0.06652355990642472, + "language_loss": 0.81221354, + "learning_rate": 0.00034881373480659085, + "loss": 0.82318163, + "num_input_tokens_seen": 264494944, + "router_z_loss_mlp": 0.16186523, + "step": 3170, + "time_per_iteration": 2.8547778129577637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101156, + "balance_loss_mlp": 1.08508694, + "epoch": 0.610042323970758, + "flos": 469205996544.0, + "grad_norm": 0.08688268797683278, + "language_loss": 0.77884257, + "learning_rate": 0.0003485168052120594, + "loss": 0.78985405, + "num_input_tokens_seen": 264561664, + "router_z_loss_mlp": 0.16064453, + "step": 3171, + "time_per_iteration": 2.6543068885803223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110044, + "balance_loss_mlp": 1.08477592, 
+ "epoch": 0.6102347056560216, + "flos": 514177403904.0, + "grad_norm": 0.09027989422234346, + "language_loss": 0.79380625, + "learning_rate": 0.00034821993442851973, + "loss": 0.80481064, + "num_input_tokens_seen": 264626256, + "router_z_loss_mlp": 0.15649414, + "step": 3172, + "time_per_iteration": 2.6117188930511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100941, + "balance_loss_mlp": 1.08522928, + "epoch": 0.6104270873412851, + "flos": 469013276160.0, + "grad_norm": 0.1005367012587997, + "language_loss": 0.82141685, + "learning_rate": 0.00034792312257122735, + "loss": 0.83242625, + "num_input_tokens_seen": 264692768, + "router_z_loss_mlp": 0.15698242, + "step": 3173, + "time_per_iteration": 2.634824752807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107115, + "balance_loss_mlp": 1.09201097, + "epoch": 0.6106194690265486, + "flos": 549875837952.0, + "grad_norm": 0.07806982240241292, + "language_loss": 0.80516702, + "learning_rate": 0.00034762636975541506, + "loss": 0.81623822, + "num_input_tokens_seen": 264764816, + "router_z_loss_mlp": 0.15087891, + "step": 3174, + "time_per_iteration": 2.7511277198791504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111247, + "balance_loss_mlp": 1.09719944, + "epoch": 0.6108118507118122, + "flos": 472857772032.0, + "grad_norm": 0.09012937190678837, + "language_loss": 0.80371904, + "learning_rate": 0.0003473296760962923, + "loss": 0.81484377, + "num_input_tokens_seen": 264837968, + "router_z_loss_mlp": 0.15246582, + "step": 3175, + "time_per_iteration": 2.7333414554595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105343, + "balance_loss_mlp": 1.04603887, + "epoch": 0.6110042323970758, + "flos": 1445166904320.0, + "grad_norm": 0.017873347223140334, + "language_loss": 0.78533739, + "learning_rate": 0.00034703304170904617, + "loss": 0.79587168, + "num_input_tokens_seen": 265058336, + "router_z_loss_mlp": 0.07373047, + "step": 3176, + 
"time_per_iteration": 4.656734943389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112456, + "balance_loss_mlp": 1.10965848, + "epoch": 0.6111966140823394, + "flos": 794153590272.0, + "grad_norm": 0.07170779608360676, + "language_loss": 0.81361848, + "learning_rate": 0.00034673646670883976, + "loss": 0.82486403, + "num_input_tokens_seen": 265135920, + "router_z_loss_mlp": 0.14892578, + "step": 3177, + "time_per_iteration": 2.9838032722473145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053161, + "balance_loss_mlp": 1.04572225, + "epoch": 0.611388995767603, + "flos": 1557650663424.0, + "grad_norm": 0.018001303469989904, + "language_loss": 0.75715023, + "learning_rate": 0.0003464399512108141, + "loss": 0.76768184, + "num_input_tokens_seen": 265374464, + "router_z_loss_mlp": 0.07421875, + "step": 3178, + "time_per_iteration": 4.987392425537109 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130172, + "balance_loss_mlp": 1.11506796, + "epoch": 0.6115813774528664, + "flos": 712169210880.0, + "grad_norm": 0.0710561364168879, + "language_loss": 0.82215559, + "learning_rate": 0.0003461434953300865, + "loss": 0.83345723, + "num_input_tokens_seen": 265450112, + "router_z_loss_mlp": 0.15075684, + "step": 3179, + "time_per_iteration": 2.972102165222168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129428, + "balance_loss_mlp": 1.11437213, + "epoch": 0.61177375913813, + "flos": 684308072448.0, + "grad_norm": 0.06625806695927375, + "language_loss": 0.81118929, + "learning_rate": 0.0003458470991817515, + "loss": 0.82248354, + "num_input_tokens_seen": 265534336, + "router_z_loss_mlp": 0.15039062, + "step": 3180, + "time_per_iteration": 2.9920318126678467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138939, + "balance_loss_mlp": 1.12371635, + "epoch": 0.6119661408233936, + "flos": 511662127104.0, + "grad_norm": 0.09554430463950304, + "language_loss": 0.84819943, + 
"learning_rate": 0.0003455507628808802, + "loss": 0.8595888, + "num_input_tokens_seen": 265604480, + "router_z_loss_mlp": 0.15197754, + "step": 3181, + "time_per_iteration": 2.620678424835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138152, + "balance_loss_mlp": 1.12281001, + "epoch": 0.6121585225086572, + "flos": 556809002496.0, + "grad_norm": 0.07764809477009631, + "language_loss": 0.84588206, + "learning_rate": 0.00034525448654252076, + "loss": 0.85726357, + "num_input_tokens_seen": 265670848, + "router_z_loss_mlp": 0.15319824, + "step": 3182, + "time_per_iteration": 2.662243366241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132309, + "balance_loss_mlp": 1.11651397, + "epoch": 0.6123509041939207, + "flos": 561849467904.0, + "grad_norm": 0.08919622612772353, + "language_loss": 0.8301183, + "learning_rate": 0.0003449582702816976, + "loss": 0.84144139, + "num_input_tokens_seen": 265739584, + "router_z_loss_mlp": 0.15783691, + "step": 3183, + "time_per_iteration": 2.696509599685669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131366, + "balance_loss_mlp": 1.11577392, + "epoch": 0.6125432858791843, + "flos": 558056729088.0, + "grad_norm": 0.07246136408920362, + "language_loss": 0.82839715, + "learning_rate": 0.0003446621142134122, + "loss": 0.83971083, + "num_input_tokens_seen": 265810368, + "router_z_loss_mlp": 0.15576172, + "step": 3184, + "time_per_iteration": 2.6876282691955566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129534, + "balance_loss_mlp": 1.1142869, + "epoch": 0.6127356675644479, + "flos": 415015944192.0, + "grad_norm": 0.10207734274681185, + "language_loss": 0.84166813, + "learning_rate": 0.0003443660184526424, + "loss": 0.85296345, + "num_input_tokens_seen": 265871616, + "router_z_loss_mlp": 0.15222168, + "step": 3185, + "time_per_iteration": 2.457191228866577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126874, + 
"balance_loss_mlp": 1.11150801, + "epoch": 0.6129280492497114, + "flos": 603843434496.0, + "grad_norm": 0.08690649590486366, + "language_loss": 0.86419243, + "learning_rate": 0.0003440699831143429, + "loss": 0.8754611, + "num_input_tokens_seen": 265946672, + "router_z_loss_mlp": 0.15356445, + "step": 3186, + "time_per_iteration": 2.7862656116485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117106, + "balance_loss_mlp": 1.10134614, + "epoch": 0.613120430934975, + "flos": 519766295040.0, + "grad_norm": 0.09433598630753232, + "language_loss": 0.82150078, + "learning_rate": 0.0003437740083134449, + "loss": 0.83267176, + "num_input_tokens_seen": 266020640, + "router_z_loss_mlp": 0.1574707, + "step": 3187, + "time_per_iteration": 2.732182502746582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110686, + "balance_loss_mlp": 1.09489119, + "epoch": 0.6133128126202385, + "flos": 511083965952.0, + "grad_norm": 0.107565485764287, + "language_loss": 0.83600903, + "learning_rate": 0.00034347809416485574, + "loss": 0.84711587, + "num_input_tokens_seen": 266085776, + "router_z_loss_mlp": 0.15783691, + "step": 3188, + "time_per_iteration": 2.5941028594970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108913, + "balance_loss_mlp": 1.09327221, + "epoch": 0.6135051943055021, + "flos": 607562021376.0, + "grad_norm": 0.07306418964956934, + "language_loss": 0.81643283, + "learning_rate": 0.0003431822407834597, + "loss": 0.82752192, + "num_input_tokens_seen": 266157104, + "router_z_loss_mlp": 0.15625, + "step": 3189, + "time_per_iteration": 2.79345440864563 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107509, + "balance_loss_mlp": 1.09151149, + "epoch": 0.6136975759907657, + "flos": 1160200931328.0, + "grad_norm": 0.07663580973151435, + "language_loss": 0.83989727, + "learning_rate": 0.00034288644828411706, + "loss": 0.85097235, + "num_input_tokens_seen": 266244144, + "router_z_loss_mlp": 
0.15991211, + "step": 3190, + "time_per_iteration": 3.495431423187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107683, + "balance_loss_mlp": 1.0914706, + "epoch": 0.6138899576760293, + "flos": 706938596352.0, + "grad_norm": 0.09805760174561111, + "language_loss": 0.75479543, + "learning_rate": 0.0003425907167816649, + "loss": 0.76587236, + "num_input_tokens_seen": 266319040, + "router_z_loss_mlp": 0.16210938, + "step": 3191, + "time_per_iteration": 2.890688896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100168, + "balance_loss_mlp": 1.0839076, + "epoch": 0.6140823393612928, + "flos": 586443898368.0, + "grad_norm": 0.08119558149243, + "language_loss": 0.84596795, + "learning_rate": 0.00034229504639091623, + "loss": 0.85696959, + "num_input_tokens_seen": 266390784, + "router_z_loss_mlp": 0.16259766, + "step": 3192, + "time_per_iteration": 2.799213171005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110546, + "balance_loss_mlp": 1.08940232, + "epoch": 0.6142747210465563, + "flos": 804130633728.0, + "grad_norm": 0.13197057459029027, + "language_loss": 0.79937923, + "learning_rate": 0.0003419994372266606, + "loss": 0.81043386, + "num_input_tokens_seen": 266483216, + "router_z_loss_mlp": 0.16052246, + "step": 3193, + "time_per_iteration": 3.1180262565612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103547, + "balance_loss_mlp": 1.08715582, + "epoch": 0.6144671027318199, + "flos": 529434620928.0, + "grad_norm": 0.07478792325095046, + "language_loss": 0.81555808, + "learning_rate": 0.00034170388940366335, + "loss": 0.82659352, + "num_input_tokens_seen": 266557344, + "router_z_loss_mlp": 0.16381836, + "step": 3194, + "time_per_iteration": 2.7108078002929688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105556, + "balance_loss_mlp": 1.08935523, + "epoch": 0.6146594844170835, + "flos": 805425348096.0, + "grad_norm": 0.1666581336707107, + "language_loss": 
0.80146444, + "learning_rate": 0.0003414084030366667, + "loss": 0.81251997, + "num_input_tokens_seen": 266639488, + "router_z_loss_mlp": 0.16210938, + "step": 3195, + "time_per_iteration": 3.146375894546509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098435, + "balance_loss_mlp": 1.08159113, + "epoch": 0.6148518661023471, + "flos": 501697193472.0, + "grad_norm": 0.07855669714866301, + "language_loss": 0.82993454, + "learning_rate": 0.0003411129782403883, + "loss": 0.8409189, + "num_input_tokens_seen": 266711168, + "router_z_loss_mlp": 0.1685791, + "step": 3196, + "time_per_iteration": 2.6907546520233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102656, + "balance_loss_mlp": 1.0864203, + "epoch": 0.6150442477876106, + "flos": 510688613376.0, + "grad_norm": 0.08662161159961286, + "language_loss": 0.84978783, + "learning_rate": 0.0003408176151295225, + "loss": 0.86081439, + "num_input_tokens_seen": 266777632, + "router_z_loss_mlp": 0.16235352, + "step": 3197, + "time_per_iteration": 2.7353785037994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098614, + "balance_loss_mlp": 1.08207965, + "epoch": 0.6152366294728742, + "flos": 527005979136.0, + "grad_norm": 0.11963983083590954, + "language_loss": 0.77372497, + "learning_rate": 0.00034052231381873944, + "loss": 0.78471112, + "num_input_tokens_seen": 266842880, + "router_z_loss_mlp": 0.16540527, + "step": 3198, + "time_per_iteration": 2.673388957977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097817, + "balance_loss_mlp": 1.08129418, + "epoch": 0.6154290111581378, + "flos": 473300112384.0, + "grad_norm": 0.07877091537638886, + "language_loss": 0.84876865, + "learning_rate": 0.00034022707442268494, + "loss": 0.85974681, + "num_input_tokens_seen": 266909504, + "router_z_loss_mlp": 0.1652832, + "step": 3199, + "time_per_iteration": 2.5626182556152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090925, + 
"balance_loss_mlp": 1.07454538, + "epoch": 0.6156213928434013, + "flos": 550819616256.0, + "grad_norm": 0.07568498479176501, + "language_loss": 0.81815386, + "learning_rate": 0.0003399318970559813, + "loss": 0.82906306, + "num_input_tokens_seen": 266988880, + "router_z_loss_mlp": 0.16381836, + "step": 3200, + "time_per_iteration": 2.829237461090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108966, + "balance_loss_mlp": 1.07353139, + "epoch": 0.6158137745286649, + "flos": 750941259264.0, + "grad_norm": 0.2497942099132976, + "language_loss": 0.8433665, + "learning_rate": 0.00033963678183322656, + "loss": 0.85426307, + "num_input_tokens_seen": 267074512, + "router_z_loss_mlp": 0.16125488, + "step": 3201, + "time_per_iteration": 3.1063387393951416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087739, + "balance_loss_mlp": 1.07162154, + "epoch": 0.6160061562139284, + "flos": 555815665152.0, + "grad_norm": 0.06940460952874025, + "language_loss": 0.82539898, + "learning_rate": 0.0003393417288689945, + "loss": 0.83627635, + "num_input_tokens_seen": 267147952, + "router_z_loss_mlp": 0.16113281, + "step": 3202, + "time_per_iteration": 2.7065072059631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093588, + "balance_loss_mlp": 1.0775063, + "epoch": 0.616198537899192, + "flos": 742177437696.0, + "grad_norm": 0.08060008317875632, + "language_loss": 0.75810564, + "learning_rate": 0.00033904673827783504, + "loss": 0.76904154, + "num_input_tokens_seen": 267224368, + "router_z_loss_mlp": 0.1607666, + "step": 3203, + "time_per_iteration": 2.976076364517212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091051, + "balance_loss_mlp": 1.07505345, + "epoch": 0.6163909195844556, + "flos": 478810082304.0, + "grad_norm": 0.05609765928721304, + "language_loss": 0.81290334, + "learning_rate": 0.00033875181017427357, + "loss": 0.8238138, + "num_input_tokens_seen": 267292688, + "router_z_loss_mlp": 
0.15991211, + "step": 3204, + "time_per_iteration": 2.617102861404419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094322, + "balance_loss_mlp": 1.0783596, + "epoch": 0.6165833012697192, + "flos": 531517469184.0, + "grad_norm": 0.06962026765049416, + "language_loss": 0.80802751, + "learning_rate": 0.00033845694467281133, + "loss": 0.81897068, + "num_input_tokens_seen": 267371888, + "router_z_loss_mlp": 0.1595459, + "step": 3205, + "time_per_iteration": 2.9406063556671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100233, + "balance_loss_mlp": 1.08492684, + "epoch": 0.6167756829549826, + "flos": 807765156864.0, + "grad_norm": 0.08157941962089017, + "language_loss": 0.83428419, + "learning_rate": 0.00033816214188792516, + "loss": 0.84528655, + "num_input_tokens_seen": 267458784, + "router_z_loss_mlp": 0.1529541, + "step": 3206, + "time_per_iteration": 3.1819798946380615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097691, + "balance_loss_mlp": 1.08228946, + "epoch": 0.6169680646402462, + "flos": 488928089088.0, + "grad_norm": 0.0725317157216798, + "language_loss": 0.85080433, + "learning_rate": 0.00033786740193406784, + "loss": 0.86178124, + "num_input_tokens_seen": 267528528, + "router_z_loss_mlp": 0.15380859, + "step": 3207, + "time_per_iteration": 2.5949695110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099512, + "balance_loss_mlp": 1.08397925, + "epoch": 0.6171604463255098, + "flos": 618954918912.0, + "grad_norm": 0.09100196338205928, + "language_loss": 0.81269908, + "learning_rate": 0.00033757272492566736, + "loss": 0.82369423, + "num_input_tokens_seen": 267611152, + "router_z_loss_mlp": 0.15515137, + "step": 3208, + "time_per_iteration": 2.896113157272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101736, + "balance_loss_mlp": 1.08576202, + "epoch": 0.6173528280107734, + "flos": 528859031040.0, + "grad_norm": 0.061084762656912546, + 
"language_loss": 0.86857277, + "learning_rate": 0.0003372781109771278, + "loss": 0.87959015, + "num_input_tokens_seen": 267681520, + "router_z_loss_mlp": 0.15966797, + "step": 3209, + "time_per_iteration": 2.7744648456573486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098286, + "balance_loss_mlp": 1.08235943, + "epoch": 0.617545209696037, + "flos": 596581728768.0, + "grad_norm": 0.0666635733945454, + "language_loss": 0.7634722, + "learning_rate": 0.0003369835602028281, + "loss": 0.77445507, + "num_input_tokens_seen": 267758768, + "router_z_loss_mlp": 0.15917969, + "step": 3210, + "time_per_iteration": 2.807690143585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109414, + "balance_loss_mlp": 1.07845259, + "epoch": 0.6177375913813005, + "flos": 475098835968.0, + "grad_norm": 0.06505304980204422, + "language_loss": 0.79307866, + "learning_rate": 0.0003366890727171232, + "loss": 0.80402005, + "num_input_tokens_seen": 267831056, + "router_z_loss_mlp": 0.15673828, + "step": 3211, + "time_per_iteration": 2.6847074031829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092887, + "balance_loss_mlp": 1.07701993, + "epoch": 0.617929973066564, + "flos": 529812721152.0, + "grad_norm": 0.08815950120803863, + "language_loss": 0.78273273, + "learning_rate": 0.00033639464863434313, + "loss": 0.79366159, + "num_input_tokens_seen": 267898416, + "router_z_loss_mlp": 0.15856934, + "step": 3212, + "time_per_iteration": 2.6401009559631348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_mlp": 1.0277102, + "epoch": 0.6181223547518276, + "flos": 1420053783552.0, + "grad_norm": 0.026269033760010364, + "language_loss": 0.78442466, + "learning_rate": 0.00033610028806879363, + "loss": 0.79478669, + "num_input_tokens_seen": 268112864, + "router_z_loss_mlp": 0.08496094, + "step": 3213, + "time_per_iteration": 4.715362787246704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.0108628, + "balance_loss_mlp": 1.07037783, + "epoch": 0.6183147364370912, + "flos": 740319243264.0, + "grad_norm": 0.0738307593479646, + "language_loss": 0.79866982, + "learning_rate": 0.00033580599113475543, + "loss": 0.80953264, + "num_input_tokens_seen": 268198368, + "router_z_loss_mlp": 0.15893555, + "step": 3214, + "time_per_iteration": 3.000586986541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085789, + "balance_loss_mlp": 1.06937385, + "epoch": 0.6185071181223547, + "flos": 381649978368.0, + "grad_norm": 0.07082068470291375, + "language_loss": 0.86112303, + "learning_rate": 0.00033551175794648507, + "loss": 0.87198091, + "num_input_tokens_seen": 268260704, + "router_z_loss_mlp": 0.16418457, + "step": 3215, + "time_per_iteration": 2.494271755218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090218, + "balance_loss_mlp": 1.0744828, + "epoch": 0.6186994998076183, + "flos": 463347661824.0, + "grad_norm": 0.12386747006326235, + "language_loss": 0.81595516, + "learning_rate": 0.00033521758861821365, + "loss": 0.82685733, + "num_input_tokens_seen": 268328256, + "router_z_loss_mlp": 0.15722656, + "step": 3216, + "time_per_iteration": 2.646888256072998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084021, + "balance_loss_mlp": 1.06845176, + "epoch": 0.6188918814928819, + "flos": 485273742336.0, + "grad_norm": 0.07895450419788622, + "language_loss": 0.88963878, + "learning_rate": 0.0003349234832641479, + "loss": 0.90047896, + "num_input_tokens_seen": 268394016, + "router_z_loss_mlp": 0.15551758, + "step": 3217, + "time_per_iteration": 2.603308916091919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082545, + "balance_loss_mlp": 1.06719124, + "epoch": 0.6190842631781455, + "flos": 657307021824.0, + "grad_norm": 0.07412246330535043, + "language_loss": 0.808752, + "learning_rate": 0.00033462944199846975, + "loss": 0.81957746, + "num_input_tokens_seen": 268478512, + 
"router_z_loss_mlp": 0.15332031, + "step": 3218, + "time_per_iteration": 3.086716413497925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083407, + "balance_loss_mlp": 1.06795752, + "epoch": 0.619276644863409, + "flos": 403603223040.0, + "grad_norm": 0.07145505501141985, + "language_loss": 0.86298114, + "learning_rate": 0.00033433546493533606, + "loss": 0.87381524, + "num_input_tokens_seen": 268540304, + "router_z_loss_mlp": 0.15429688, + "step": 3219, + "time_per_iteration": 2.525264024734497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079767, + "balance_loss_mlp": 1.06349516, + "epoch": 0.6194690265486725, + "flos": 583093499904.0, + "grad_norm": 0.086291171152169, + "language_loss": 0.83994114, + "learning_rate": 0.00033404155218887897, + "loss": 0.85073888, + "num_input_tokens_seen": 268611136, + "router_z_loss_mlp": 0.16271973, + "step": 3220, + "time_per_iteration": 2.7530763149261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080633, + "balance_loss_mlp": 1.06478977, + "epoch": 0.6196614082339361, + "flos": 504246974976.0, + "grad_norm": 0.11530682173053017, + "language_loss": 0.87328637, + "learning_rate": 0.00033374770387320534, + "loss": 0.88409269, + "num_input_tokens_seen": 268684992, + "router_z_loss_mlp": 0.1583252, + "step": 3221, + "time_per_iteration": 2.769804000854492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107728, + "balance_loss_mlp": 1.06110358, + "epoch": 0.6198537899191997, + "flos": 575409277440.0, + "grad_norm": 0.09653805931546991, + "language_loss": 0.84981918, + "learning_rate": 0.00033345392010239737, + "loss": 0.86059201, + "num_input_tokens_seen": 268758096, + "router_z_loss_mlp": 0.16174316, + "step": 3222, + "time_per_iteration": 2.742431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080173, + "balance_loss_mlp": 1.0643059, + "epoch": 0.6200461716044633, + "flos": 593157178368.0, + "grad_norm": 
0.08405780593634497, + "language_loss": 0.82221037, + "learning_rate": 0.0003331602009905118, + "loss": 0.8330121, + "num_input_tokens_seen": 268834432, + "router_z_loss_mlp": 0.15856934, + "step": 3223, + "time_per_iteration": 2.8276350498199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075434, + "balance_loss_mlp": 1.05924559, + "epoch": 0.6202385532897268, + "flos": 666093238272.0, + "grad_norm": 0.16424334065153295, + "language_loss": 0.83946419, + "learning_rate": 0.00033286654665158085, + "loss": 0.85021853, + "num_input_tokens_seen": 268921168, + "router_z_loss_mlp": 0.16186523, + "step": 3224, + "time_per_iteration": 3.0171141624450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074402, + "balance_loss_mlp": 1.05797529, + "epoch": 0.6204309349749904, + "flos": 484952541696.0, + "grad_norm": 0.07119512834175158, + "language_loss": 0.8751117, + "learning_rate": 0.0003325729571996109, + "loss": 0.88585573, + "num_input_tokens_seen": 268991440, + "router_z_loss_mlp": 0.16430664, + "step": 3225, + "time_per_iteration": 2.6336770057678223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107489, + "balance_loss_mlp": 1.05822468, + "epoch": 0.6206233166602539, + "flos": 584057101824.0, + "grad_norm": 0.07015160541541936, + "language_loss": 0.83497381, + "learning_rate": 0.000332279432748584, + "loss": 0.84572268, + "num_input_tokens_seen": 269061024, + "router_z_loss_mlp": 0.16674805, + "step": 3226, + "time_per_iteration": 2.8068268299102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077491, + "balance_loss_mlp": 1.06129014, + "epoch": 0.6208156983455175, + "flos": 476917383168.0, + "grad_norm": 0.08244551299177609, + "language_loss": 0.87847024, + "learning_rate": 0.00033198597341245576, + "loss": 0.88924515, + "num_input_tokens_seen": 269130560, + "router_z_loss_mlp": 0.1619873, + "step": 3227, + "time_per_iteration": 2.6014742851257324 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01070414, + "balance_loss_mlp": 1.05366468, + "epoch": 0.6210080800307811, + "flos": 789066137088.0, + "grad_norm": 0.25336628226947533, + "language_loss": 0.82029134, + "learning_rate": 0.00033169257930515763, + "loss": 0.83099544, + "num_input_tokens_seen": 269213280, + "router_z_loss_mlp": 0.16760254, + "step": 3228, + "time_per_iteration": 3.086378335952759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080291, + "balance_loss_mlp": 1.06385183, + "epoch": 0.6212004617160446, + "flos": 607794388992.0, + "grad_norm": 0.06847993393240591, + "language_loss": 0.81926602, + "learning_rate": 0.0003313992505405951, + "loss": 0.83006895, + "num_input_tokens_seen": 269286384, + "router_z_loss_mlp": 0.16442871, + "step": 3229, + "time_per_iteration": 2.721404552459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108161, + "balance_loss_mlp": 1.06523085, + "epoch": 0.6213928434013082, + "flos": 586520621568.0, + "grad_norm": 0.08774924487902723, + "language_loss": 0.81243527, + "learning_rate": 0.0003311059872326487, + "loss": 0.82325131, + "num_input_tokens_seen": 269353296, + "router_z_loss_mlp": 0.16381836, + "step": 3230, + "time_per_iteration": 2.698370933532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108911, + "balance_loss_mlp": 1.07283747, + "epoch": 0.6215852250865718, + "flos": 536076320256.0, + "grad_norm": 0.06270851897860089, + "language_loss": 0.79239869, + "learning_rate": 0.0003308127894951734, + "loss": 0.80328983, + "num_input_tokens_seen": 269422304, + "router_z_loss_mlp": 0.16271973, + "step": 3231, + "time_per_iteration": 2.642587900161743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103422, + "balance_loss_mlp": 1.08775783, + "epoch": 0.6217776067718354, + "flos": 618169356288.0, + "grad_norm": 0.08661735945453952, + "language_loss": 0.86286879, + "learning_rate": 0.00033051965744199834, + "loss": 0.87390304, + "num_input_tokens_seen": 
269498784, + "router_z_loss_mlp": 0.15649414, + "step": 3232, + "time_per_iteration": 2.7654480934143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110456, + "balance_loss_mlp": 1.08876467, + "epoch": 0.6219699884570988, + "flos": 545875324416.0, + "grad_norm": 0.08070984322149112, + "language_loss": 0.90182227, + "learning_rate": 0.0003302265911869276, + "loss": 0.91286784, + "num_input_tokens_seen": 269581264, + "router_z_loss_mlp": 0.15795898, + "step": 3233, + "time_per_iteration": 2.973137378692627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102412, + "balance_loss_mlp": 1.0863899, + "epoch": 0.6221623701423624, + "flos": 481149891072.0, + "grad_norm": 0.10903375315804033, + "language_loss": 0.83981085, + "learning_rate": 0.0003299335908437397, + "loss": 0.85083497, + "num_input_tokens_seen": 269649408, + "router_z_loss_mlp": 0.16015625, + "step": 3234, + "time_per_iteration": 2.6683669090270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110257, + "balance_loss_mlp": 1.08685815, + "epoch": 0.622354751827626, + "flos": 380024151552.0, + "grad_norm": 0.08931018897921299, + "language_loss": 0.79380894, + "learning_rate": 0.0003296406565261873, + "loss": 0.8048346, + "num_input_tokens_seen": 269711648, + "router_z_loss_mlp": 0.15698242, + "step": 3235, + "time_per_iteration": 2.4825046062469482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093994, + "balance_loss_mlp": 1.07830596, + "epoch": 0.6225471335128896, + "flos": 667869940224.0, + "grad_norm": 0.08356203677031868, + "language_loss": 0.84839869, + "learning_rate": 0.0003293477883479978, + "loss": 0.85933864, + "num_input_tokens_seen": 269787376, + "router_z_loss_mlp": 0.15673828, + "step": 3236, + "time_per_iteration": 2.855417013168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096114, + "balance_loss_mlp": 1.08046174, + "epoch": 0.6227395151981532, + "flos": 771320807424.0, + "grad_norm": 
0.0752906084942527, + "language_loss": 0.79873055, + "learning_rate": 0.0003290549864228727, + "loss": 0.80969167, + "num_input_tokens_seen": 269863008, + "router_z_loss_mlp": 0.15637207, + "step": 3237, + "time_per_iteration": 2.954319953918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094914, + "balance_loss_mlp": 1.07898724, + "epoch": 0.6229318968834167, + "flos": 484354556928.0, + "grad_norm": 0.0798274919474459, + "language_loss": 0.86145848, + "learning_rate": 0.0003287622508644875, + "loss": 0.87240762, + "num_input_tokens_seen": 269939552, + "router_z_loss_mlp": 0.15917969, + "step": 3238, + "time_per_iteration": 2.7834508419036865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095092, + "balance_loss_mlp": 1.07920146, + "epoch": 0.6231242785686802, + "flos": 462935056896.0, + "grad_norm": 0.08228635643627878, + "language_loss": 0.86427939, + "learning_rate": 0.0003284695817864923, + "loss": 0.87523031, + "num_input_tokens_seen": 270002752, + "router_z_loss_mlp": 0.15881348, + "step": 3239, + "time_per_iteration": 2.52299427986145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089491, + "balance_loss_mlp": 1.07388628, + "epoch": 0.6233166602539438, + "flos": 609089103360.0, + "grad_norm": 0.07912840789320032, + "language_loss": 0.83886796, + "learning_rate": 0.0003281769793025116, + "loss": 0.84976286, + "num_input_tokens_seen": 270075696, + "router_z_loss_mlp": 0.15588379, + "step": 3240, + "time_per_iteration": 2.736513614654541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090298, + "balance_loss_mlp": 1.07525432, + "epoch": 0.6235090419392074, + "flos": 439200340992.0, + "grad_norm": 0.08036892690919402, + "language_loss": 0.89556086, + "learning_rate": 0.00032788444352614346, + "loss": 0.90646392, + "num_input_tokens_seen": 270139872, + "router_z_loss_mlp": 0.15014648, + "step": 3241, + "time_per_iteration": 2.532486915588379 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01091662, + "balance_loss_mlp": 1.07645059, + "epoch": 0.6237014236244709, + "flos": 504904430592.0, + "grad_norm": 0.10748346186941515, + "language_loss": 0.80754519, + "learning_rate": 0.0003275919745709606, + "loss": 0.81846178, + "num_input_tokens_seen": 270206752, + "router_z_loss_mlp": 0.15197754, + "step": 3242, + "time_per_iteration": 2.6164467334747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093762, + "balance_loss_mlp": 1.07853913, + "epoch": 0.6238938053097345, + "flos": 512917194240.0, + "grad_norm": 0.07410139780614007, + "language_loss": 0.82327247, + "learning_rate": 0.00032729957255050936, + "loss": 0.83421004, + "num_input_tokens_seen": 270275472, + "router_z_loss_mlp": 0.15197754, + "step": 3243, + "time_per_iteration": 2.711912155151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094763, + "balance_loss_mlp": 1.07895613, + "epoch": 0.6240861869949981, + "flos": 736751531520.0, + "grad_norm": 0.07913543428232035, + "language_loss": 0.81355995, + "learning_rate": 0.0003270072375783102, + "loss": 0.82450759, + "num_input_tokens_seen": 270348336, + "router_z_loss_mlp": 0.15795898, + "step": 3244, + "time_per_iteration": 2.896878242492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091166, + "balance_loss_mlp": 1.07548952, + "epoch": 0.6242785686802617, + "flos": 494712271872.0, + "grad_norm": 0.10691714389102631, + "language_loss": 0.79955053, + "learning_rate": 0.00032671496976785774, + "loss": 0.81046224, + "num_input_tokens_seen": 270416496, + "router_z_loss_mlp": 0.15661621, + "step": 3245, + "time_per_iteration": 2.6352155208587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089637, + "balance_loss_mlp": 1.07429433, + "epoch": 0.6244709503655252, + "flos": 745846465536.0, + "grad_norm": 0.06870861562769151, + "language_loss": 0.75493729, + "learning_rate": 0.0003264227692326205, + "loss": 0.76583362, + "num_input_tokens_seen": 
270501680, + "router_z_loss_mlp": 0.15319824, + "step": 3246, + "time_per_iteration": 3.093111991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098568, + "balance_loss_mlp": 1.08292735, + "epoch": 0.6246633320507887, + "flos": 492602259456.0, + "grad_norm": 0.06424326039406808, + "language_loss": 0.85849744, + "learning_rate": 0.00032613063608604055, + "loss": 0.86948311, + "num_input_tokens_seen": 270568656, + "router_z_loss_mlp": 0.15625, + "step": 3247, + "time_per_iteration": 2.5499489307403564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109153, + "balance_loss_mlp": 1.07599711, + "epoch": 0.6248557137360523, + "flos": 517391981568.0, + "grad_norm": 0.07629898718313471, + "language_loss": 0.83584791, + "learning_rate": 0.0003258385704415343, + "loss": 0.84676319, + "num_input_tokens_seen": 270636160, + "router_z_loss_mlp": 0.15515137, + "step": 3248, + "time_per_iteration": 2.6027162075042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098413, + "balance_loss_mlp": 1.08287978, + "epoch": 0.6250480954213159, + "flos": 519363601920.0, + "grad_norm": 0.08365862742240879, + "language_loss": 0.83149463, + "learning_rate": 0.0003255465724124915, + "loss": 0.84247875, + "num_input_tokens_seen": 270708816, + "router_z_loss_mlp": 0.15515137, + "step": 3249, + "time_per_iteration": 2.730041742324829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104065, + "balance_loss_mlp": 1.08905637, + "epoch": 0.6252404771065795, + "flos": 516060191232.0, + "grad_norm": 0.06996210477337128, + "language_loss": 0.82732821, + "learning_rate": 0.00032525464211227587, + "loss": 0.83836889, + "num_input_tokens_seen": 270778016, + "router_z_loss_mlp": 0.14990234, + "step": 3250, + "time_per_iteration": 2.610226631164551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103584, + "balance_loss_mlp": 1.08822954, + "epoch": 0.6254328587918431, + "flos": 576916535808.0, + "grad_norm": 
0.07802302552021714, + "language_loss": 0.85721552, + "learning_rate": 0.0003249627796542249, + "loss": 0.86825138, + "num_input_tokens_seen": 270847072, + "router_z_loss_mlp": 0.15344238, + "step": 3251, + "time_per_iteration": 2.6803338527679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096481, + "balance_loss_mlp": 1.08087671, + "epoch": 0.6256252404771065, + "flos": 597930771456.0, + "grad_norm": 0.06796886597931054, + "language_loss": 0.84280014, + "learning_rate": 0.00032467098515164943, + "loss": 0.85376501, + "num_input_tokens_seen": 270926320, + "router_z_loss_mlp": 0.15588379, + "step": 3252, + "time_per_iteration": 2.904672861099243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111129, + "balance_loss_mlp": 1.0956316, + "epoch": 0.6258176221623701, + "flos": 508299245568.0, + "grad_norm": 0.09344441617703737, + "language_loss": 0.84051675, + "learning_rate": 0.00032437925871783456, + "loss": 0.85162807, + "num_input_tokens_seen": 270997904, + "router_z_loss_mlp": 0.15490723, + "step": 3253, + "time_per_iteration": 2.704474925994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101193, + "balance_loss_mlp": 1.08483791, + "epoch": 0.6260100038476337, + "flos": 639645755904.0, + "grad_norm": 0.07749015001842677, + "language_loss": 0.84249985, + "learning_rate": 0.00032408760046603803, + "loss": 0.85351181, + "num_input_tokens_seen": 271074256, + "router_z_loss_mlp": 0.16357422, + "step": 3254, + "time_per_iteration": 2.849126100540161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103151, + "balance_loss_mlp": 1.08711767, + "epoch": 0.6262023855328973, + "flos": 841007784960.0, + "grad_norm": 0.06356173673048542, + "language_loss": 0.77591729, + "learning_rate": 0.00032379601050949193, + "loss": 0.7869488, + "num_input_tokens_seen": 271155152, + "router_z_loss_mlp": 0.16027832, + "step": 3255, + "time_per_iteration": 3.119446039199829 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01091759, + "balance_loss_mlp": 1.07567763, + "epoch": 0.6263947672181608, + "flos": 522138410496.0, + "grad_norm": 0.07099798936628814, + "language_loss": 0.88052809, + "learning_rate": 0.0003235044889614013, + "loss": 0.8914457, + "num_input_tokens_seen": 271224784, + "router_z_loss_mlp": 0.1607666, + "step": 3256, + "time_per_iteration": 2.613060235977173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094317, + "balance_loss_mlp": 1.07879567, + "epoch": 0.6265871489034244, + "flos": 607055440896.0, + "grad_norm": 0.09103285060776488, + "language_loss": 0.8368516, + "learning_rate": 0.0003232130359349451, + "loss": 0.84779477, + "num_input_tokens_seen": 271303584, + "router_z_loss_mlp": 0.1550293, + "step": 3257, + "time_per_iteration": 2.8671774864196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089032, + "balance_loss_mlp": 1.07287872, + "epoch": 0.626779530588688, + "flos": 588484901376.0, + "grad_norm": 0.0836607688375681, + "language_loss": 0.81645948, + "learning_rate": 0.0003229216515432751, + "loss": 0.82734984, + "num_input_tokens_seen": 271379632, + "router_z_loss_mlp": 0.16149902, + "step": 3258, + "time_per_iteration": 2.8217055797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093741, + "balance_loss_mlp": 1.0781126, + "epoch": 0.6269719122739515, + "flos": 438612268032.0, + "grad_norm": 0.07437080519931394, + "language_loss": 0.79591352, + "learning_rate": 0.0003226303358995174, + "loss": 0.80685091, + "num_input_tokens_seen": 271447808, + "router_z_loss_mlp": 0.15612793, + "step": 3259, + "time_per_iteration": 2.613922595977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109242, + "balance_loss_mlp": 1.07588553, + "epoch": 0.6271642939592151, + "flos": 562874738688.0, + "grad_norm": 0.06263163093589014, + "language_loss": 0.88819879, + "learning_rate": 0.00032233908911677, + "loss": 0.89912301, + "num_input_tokens_seen": 
271526768, + "router_z_loss_mlp": 0.16540527, + "step": 3260, + "time_per_iteration": 2.855600118637085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109092, + "balance_loss_mlp": 1.07450485, + "epoch": 0.6273566756444786, + "flos": 514560273408.0, + "grad_norm": 0.06460016363514721, + "language_loss": 0.80802065, + "learning_rate": 0.0003220479113081053, + "loss": 0.81892991, + "num_input_tokens_seen": 271597840, + "router_z_loss_mlp": 0.16418457, + "step": 3261, + "time_per_iteration": 2.753509759902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085244, + "balance_loss_mlp": 1.06910312, + "epoch": 0.6275490573297422, + "flos": 585472955904.0, + "grad_norm": 0.074937478592973, + "language_loss": 0.79032731, + "learning_rate": 0.00032175680258656836, + "loss": 0.80117977, + "num_input_tokens_seen": 271668352, + "router_z_loss_mlp": 0.16137695, + "step": 3262, + "time_per_iteration": 2.7336065769195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085552, + "balance_loss_mlp": 1.06954229, + "epoch": 0.6277414390150058, + "flos": 559423024128.0, + "grad_norm": 0.06015193391132931, + "language_loss": 0.79762304, + "learning_rate": 0.00032146576306517794, + "loss": 0.80847853, + "num_input_tokens_seen": 271743936, + "router_z_loss_mlp": 0.16003418, + "step": 3263, + "time_per_iteration": 2.8162710666656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087014, + "balance_loss_mlp": 1.070611, + "epoch": 0.6279338207002694, + "flos": 612706374144.0, + "grad_norm": 0.08732390262483163, + "language_loss": 0.80907923, + "learning_rate": 0.0003211747928569255, + "loss": 0.81994939, + "num_input_tokens_seen": 271817008, + "router_z_loss_mlp": 0.1640625, + "step": 3264, + "time_per_iteration": 2.7805709838867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087684, + "balance_loss_mlp": 1.07150757, + "epoch": 0.6281262023855329, + "flos": 625685451264.0, + "grad_norm": 
0.06366142393715324, + "language_loss": 0.81574047, + "learning_rate": 0.0003208838920747754, + "loss": 0.82661736, + "num_input_tokens_seen": 271896960, + "router_z_loss_mlp": 0.16174316, + "step": 3265, + "time_per_iteration": 2.8634932041168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087654, + "balance_loss_mlp": 1.07176387, + "epoch": 0.6283185840707964, + "flos": 1123600564224.0, + "grad_norm": 0.06892871755232625, + "language_loss": 0.76471019, + "learning_rate": 0.0003205930608316656, + "loss": 0.77558672, + "num_input_tokens_seen": 271985008, + "router_z_loss_mlp": 0.15881348, + "step": 3266, + "time_per_iteration": 3.491633176803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088752, + "balance_loss_mlp": 1.07274199, + "epoch": 0.62851096575606, + "flos": 515239750656.0, + "grad_norm": 0.07065676872193134, + "language_loss": 0.84763551, + "learning_rate": 0.00032030229924050673, + "loss": 0.85852307, + "num_input_tokens_seen": 272056368, + "router_z_loss_mlp": 0.16003418, + "step": 3267, + "time_per_iteration": 2.7322630882263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081627, + "balance_loss_mlp": 1.0655694, + "epoch": 0.6287033474413236, + "flos": 404171472384.0, + "grad_norm": 0.076810738762244, + "language_loss": 0.80159783, + "learning_rate": 0.00032001160741418247, + "loss": 0.81241405, + "num_input_tokens_seen": 272123424, + "router_z_loss_mlp": 0.16052246, + "step": 3268, + "time_per_iteration": 2.683931589126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083715, + "balance_loss_mlp": 1.06859946, + "epoch": 0.6288957291265872, + "flos": 525718605312.0, + "grad_norm": 0.07050633409019491, + "language_loss": 0.81839114, + "learning_rate": 0.0003197209854655494, + "loss": 0.82922828, + "num_input_tokens_seen": 272193008, + "router_z_loss_mlp": 0.15100098, + "step": 3269, + "time_per_iteration": 2.7007665634155273 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01088437, + "balance_loss_mlp": 1.0728085, + "epoch": 0.6290881108118507, + "flos": 603722294784.0, + "grad_norm": 0.07859150018843152, + "language_loss": 0.74576277, + "learning_rate": 0.0003194304335074371, + "loss": 0.75664711, + "num_input_tokens_seen": 272275328, + "router_z_loss_mlp": 0.15625, + "step": 3270, + "time_per_iteration": 2.8443710803985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093344, + "balance_loss_mlp": 1.07737029, + "epoch": 0.6292804924971143, + "flos": 437675830272.0, + "grad_norm": 0.07641817393063347, + "language_loss": 0.88118923, + "learning_rate": 0.0003191399516526475, + "loss": 0.89212275, + "num_input_tokens_seen": 272339328, + "router_z_loss_mlp": 0.15966797, + "step": 3271, + "time_per_iteration": 2.510565996170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109192, + "balance_loss_mlp": 1.07666111, + "epoch": 0.6294728741823779, + "flos": 606662659584.0, + "grad_norm": 0.06496379597485666, + "language_loss": 0.79376519, + "learning_rate": 0.0003188495400139559, + "loss": 0.8046844, + "num_input_tokens_seen": 272416336, + "router_z_loss_mlp": 0.15234375, + "step": 3272, + "time_per_iteration": 2.8364667892456055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095867, + "balance_loss_mlp": 1.0803932, + "epoch": 0.6296652558676414, + "flos": 701529942528.0, + "grad_norm": 0.07122529047297946, + "language_loss": 0.8439455, + "learning_rate": 0.00031855919870411013, + "loss": 0.85490417, + "num_input_tokens_seen": 272490368, + "router_z_loss_mlp": 0.15466309, + "step": 3273, + "time_per_iteration": 2.8570995330810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086291, + "balance_loss_mlp": 1.0712353, + "epoch": 0.6298576375529049, + "flos": 523909969920.0, + "grad_norm": 0.06914500829494513, + "language_loss": 0.84985608, + "learning_rate": 0.0003182689278358305, + "loss": 0.86071897, + "num_input_tokens_seen": 
272562992, + "router_z_loss_mlp": 0.15039062, + "step": 3274, + "time_per_iteration": 2.757631301879883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108922, + "balance_loss_mlp": 1.07361603, + "epoch": 0.6300500192381685, + "flos": 475963693056.0, + "grad_norm": 0.07954775406848916, + "language_loss": 0.79536891, + "learning_rate": 0.0003179787275218105, + "loss": 0.80626118, + "num_input_tokens_seen": 272629456, + "router_z_loss_mlp": 0.15588379, + "step": 3275, + "time_per_iteration": 2.562164545059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083819, + "balance_loss_mlp": 1.06884634, + "epoch": 0.6302424009234321, + "flos": 520880772096.0, + "grad_norm": 0.08328401336331384, + "language_loss": 0.84322137, + "learning_rate": 0.0003176885978747155, + "loss": 0.85405958, + "num_input_tokens_seen": 272697440, + "router_z_loss_mlp": 0.14953613, + "step": 3276, + "time_per_iteration": 2.6230828762054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085747, + "balance_loss_mlp": 1.07017803, + "epoch": 0.6304347826086957, + "flos": 694596777984.0, + "grad_norm": 0.1699824723402015, + "language_loss": 0.82447994, + "learning_rate": 0.0003173985390071839, + "loss": 0.8353374, + "num_input_tokens_seen": 272774080, + "router_z_loss_mlp": 0.15551758, + "step": 3277, + "time_per_iteration": 2.913857936859131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011052, + "balance_loss_mlp": 1.00342274, + "epoch": 0.6306271642939593, + "flos": 1466858045952.0, + "grad_norm": 0.01180096248497286, + "language_loss": 0.77900457, + "learning_rate": 0.00031710855103182675, + "loss": 0.78911507, + "num_input_tokens_seen": 272998512, + "router_z_loss_mlp": 0.07617188, + "step": 3278, + "time_per_iteration": 4.810575008392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095394, + "balance_loss_mlp": 1.07975388, + "epoch": 0.6308195459792227, + "flos": 601740762624.0, + "grad_norm": 
0.07584058368204265, + "language_loss": 0.81100649, + "learning_rate": 0.00031681863406122704, + "loss": 0.82196045, + "num_input_tokens_seen": 273074672, + "router_z_loss_mlp": 0.15625, + "step": 3279, + "time_per_iteration": 2.8176543712615967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094938, + "balance_loss_mlp": 1.07984567, + "epoch": 0.6310119276644863, + "flos": 726858178560.0, + "grad_norm": 0.07145164235931235, + "language_loss": 0.85147798, + "learning_rate": 0.00031652878820794087, + "loss": 0.86242729, + "num_input_tokens_seen": 273157904, + "router_z_loss_mlp": 0.15063477, + "step": 3280, + "time_per_iteration": 3.010453462600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099528, + "balance_loss_mlp": 1.08434081, + "epoch": 0.6312043093497499, + "flos": 519749042688.0, + "grad_norm": 0.08537377503877883, + "language_loss": 0.85849619, + "learning_rate": 0.00031623901358449627, + "loss": 0.86949146, + "num_input_tokens_seen": 273228160, + "router_z_loss_mlp": 0.15161133, + "step": 3281, + "time_per_iteration": 2.6708781719207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101817, + "balance_loss_mlp": 1.08709431, + "epoch": 0.6313966910350135, + "flos": 531191499264.0, + "grad_norm": 0.05886068654642298, + "language_loss": 0.88589537, + "learning_rate": 0.0003159493103033936, + "loss": 0.89691359, + "num_input_tokens_seen": 273295872, + "router_z_loss_mlp": 0.14709473, + "step": 3282, + "time_per_iteration": 2.636570930480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023039, + "balance_loss_mlp": 1.01540971, + "epoch": 0.631589072720277, + "flos": 1379887529472.0, + "grad_norm": 0.014741970221396734, + "language_loss": 0.79919052, + "learning_rate": 0.00031565967847710564, + "loss": 0.80942094, + "num_input_tokens_seen": 273524320, + "router_z_loss_mlp": 0.07617188, + "step": 3283, + "time_per_iteration": 4.921837568283081 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01098019, + "balance_loss_mlp": 1.08298671, + "epoch": 0.6317814544055406, + "flos": 624677432832.0, + "grad_norm": 0.06611749936023467, + "language_loss": 0.82335258, + "learning_rate": 0.0003153701182180776, + "loss": 0.83433276, + "num_input_tokens_seen": 273598544, + "router_z_loss_mlp": 0.15014648, + "step": 3284, + "time_per_iteration": 2.804680824279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100744, + "balance_loss_mlp": 1.08583045, + "epoch": 0.6319738360908042, + "flos": 498119569920.0, + "grad_norm": 0.09468051023791588, + "language_loss": 0.81480467, + "learning_rate": 0.00031508062963872655, + "loss": 0.8258121, + "num_input_tokens_seen": 273666000, + "router_z_loss_mlp": 0.14892578, + "step": 3285, + "time_per_iteration": 2.618572950363159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104547, + "balance_loss_mlp": 1.08974171, + "epoch": 0.6321662177760677, + "flos": 579760353792.0, + "grad_norm": 0.07285431421686336, + "language_loss": 0.79529119, + "learning_rate": 0.0003147912128514423, + "loss": 0.80633664, + "num_input_tokens_seen": 273742672, + "router_z_loss_mlp": 0.14794922, + "step": 3286, + "time_per_iteration": 2.7349414825439453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112049, + "balance_loss_mlp": 1.0971601, + "epoch": 0.6323585994613313, + "flos": 601486373376.0, + "grad_norm": 0.07001944194285717, + "language_loss": 0.87457585, + "learning_rate": 0.0003145018679685859, + "loss": 0.88569629, + "num_input_tokens_seen": 273813984, + "router_z_loss_mlp": 0.14868164, + "step": 3287, + "time_per_iteration": 2.735057830810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106918, + "balance_loss_mlp": 1.09238625, + "epoch": 0.6325509811465948, + "flos": 528535259136.0, + "grad_norm": 0.06287056538994153, + "language_loss": 0.87662357, + "learning_rate": 0.00031421259510249134, + "loss": 0.88769281, + 
"num_input_tokens_seen": 273892848, + "router_z_loss_mlp": 0.1451416, + "step": 3288, + "time_per_iteration": 2.7864692211151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112538, + "balance_loss_mlp": 1.09816122, + "epoch": 0.6327433628318584, + "flos": 574262866944.0, + "grad_norm": 0.07989548298416052, + "language_loss": 0.80931014, + "learning_rate": 0.00031392339436546414, + "loss": 0.82043552, + "num_input_tokens_seen": 273971696, + "router_z_loss_mlp": 0.14355469, + "step": 3289, + "time_per_iteration": 2.8174936771392822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110895, + "balance_loss_mlp": 1.09549332, + "epoch": 0.632935744517122, + "flos": 517088033280.0, + "grad_norm": 0.0967935034115468, + "language_loss": 0.83535063, + "learning_rate": 0.00031363426586978205, + "loss": 0.84645951, + "num_input_tokens_seen": 274048096, + "router_z_loss_mlp": 0.15380859, + "step": 3290, + "time_per_iteration": 2.7781615257263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106481, + "balance_loss_mlp": 1.09155595, + "epoch": 0.6331281262023856, + "flos": 617462714880.0, + "grad_norm": 0.07036168037167431, + "language_loss": 0.84420347, + "learning_rate": 0.0003133452097276947, + "loss": 0.8552683, + "num_input_tokens_seen": 274122848, + "router_z_loss_mlp": 0.14904785, + "step": 3291, + "time_per_iteration": 2.7578635215759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098733, + "balance_loss_mlp": 1.08364153, + "epoch": 0.633320507887649, + "flos": 592954546176.0, + "grad_norm": 0.07346038815510673, + "language_loss": 0.84298337, + "learning_rate": 0.0003130562260514238, + "loss": 0.85397065, + "num_input_tokens_seen": 274198320, + "router_z_loss_mlp": 0.15075684, + "step": 3292, + "time_per_iteration": 2.798175096511841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092883, + "balance_loss_mlp": 1.07720733, + "epoch": 0.6335128895729126, + "flos": 
582349782528.0, + "grad_norm": 0.07455275976827726, + "language_loss": 0.81438339, + "learning_rate": 0.0003127673149531626, + "loss": 0.8253122, + "num_input_tokens_seen": 274274944, + "router_z_loss_mlp": 0.15661621, + "step": 3293, + "time_per_iteration": 2.7655112743377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086095, + "balance_loss_mlp": 1.07095516, + "epoch": 0.6337052712581762, + "flos": 453036934656.0, + "grad_norm": 0.083592197063536, + "language_loss": 0.83216, + "learning_rate": 0.0003124784765450762, + "loss": 0.84302098, + "num_input_tokens_seen": 274342384, + "router_z_loss_mlp": 0.15124512, + "step": 3294, + "time_per_iteration": 2.5880134105682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092147, + "balance_loss_mlp": 1.07686436, + "epoch": 0.6338976529434398, + "flos": 573407921664.0, + "grad_norm": 0.09213521836591561, + "language_loss": 0.79931903, + "learning_rate": 0.0003121897109393017, + "loss": 0.81024045, + "num_input_tokens_seen": 274417568, + "router_z_loss_mlp": 0.15283203, + "step": 3295, + "time_per_iteration": 2.7655093669891357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086202, + "balance_loss_mlp": 1.07047844, + "epoch": 0.6340900346287034, + "flos": 508758838272.0, + "grad_norm": 0.06242699112369121, + "language_loss": 0.88973814, + "learning_rate": 0.0003119010182479481, + "loss": 0.90060019, + "num_input_tokens_seen": 274488960, + "router_z_loss_mlp": 0.15710449, + "step": 3296, + "time_per_iteration": 2.631047010421753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086362, + "balance_loss_mlp": 1.07093644, + "epoch": 0.6342824163139669, + "flos": 479746520064.0, + "grad_norm": 0.06994096564397366, + "language_loss": 0.82599872, + "learning_rate": 0.00031161239858309563, + "loss": 0.83686233, + "num_input_tokens_seen": 274556880, + "router_z_loss_mlp": 0.15405273, + "step": 3297, + "time_per_iteration": 2.599755048751831 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086667, + "balance_loss_mlp": 1.07108665, + "epoch": 0.6344747979992305, + "flos": 572031714816.0, + "grad_norm": 0.09286327126840728, + "language_loss": 0.8328709, + "learning_rate": 0.0003113238520567964, + "loss": 0.8437376, + "num_input_tokens_seen": 274624944, + "router_z_loss_mlp": 0.15563965, + "step": 3298, + "time_per_iteration": 2.728113889694214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088611, + "balance_loss_mlp": 1.07316184, + "epoch": 0.634667179684494, + "flos": 605911601664.0, + "grad_norm": 0.09050699432092259, + "language_loss": 0.81456614, + "learning_rate": 0.00031103537878107403, + "loss": 0.82545221, + "num_input_tokens_seen": 274695152, + "router_z_loss_mlp": 0.15441895, + "step": 3299, + "time_per_iteration": 2.746675729751587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091033, + "balance_loss_mlp": 1.07576215, + "epoch": 0.6348595613697576, + "flos": 646944537600.0, + "grad_norm": 0.08418360382923895, + "language_loss": 0.7968322, + "learning_rate": 0.0003107469788679238, + "loss": 0.8077426, + "num_input_tokens_seen": 274767840, + "router_z_loss_mlp": 0.15246582, + "step": 3300, + "time_per_iteration": 2.7789735794067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086389, + "balance_loss_mlp": 1.07030737, + "epoch": 0.6350519430550212, + "flos": 639074935296.0, + "grad_norm": 0.07428233457329445, + "language_loss": 0.86447507, + "learning_rate": 0.00031045865242931267, + "loss": 0.87533897, + "num_input_tokens_seen": 274839312, + "router_z_loss_mlp": 0.1607666, + "step": 3301, + "time_per_iteration": 2.8069655895233154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096867, + "balance_loss_mlp": 1.08088112, + "epoch": 0.6352443247402847, + "flos": 686437908480.0, + "grad_norm": 0.07374364047073086, + "language_loss": 0.83124268, + "learning_rate": 0.00031017039957717877, + "loss": 0.84221137, + 
"num_input_tokens_seen": 274922704, + "router_z_loss_mlp": 0.15979004, + "step": 3302, + "time_per_iteration": 3.0203216075897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109177, + "balance_loss_mlp": 1.07607031, + "epoch": 0.6354367064255483, + "flos": 559442847744.0, + "grad_norm": 0.08011037824004849, + "language_loss": 0.88448334, + "learning_rate": 0.0003098822204234318, + "loss": 0.895401, + "num_input_tokens_seen": 274992848, + "router_z_loss_mlp": 0.15686035, + "step": 3303, + "time_per_iteration": 2.722560405731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086918, + "balance_loss_mlp": 1.07146788, + "epoch": 0.6356290881108119, + "flos": 979487520768.0, + "grad_norm": 0.14532397692109592, + "language_loss": 0.87361807, + "learning_rate": 0.00030959411507995273, + "loss": 0.88448727, + "num_input_tokens_seen": 275071456, + "router_z_loss_mlp": 0.15429688, + "step": 3304, + "time_per_iteration": 3.2270877361297607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109049, + "balance_loss_mlp": 1.07495642, + "epoch": 0.6358214697960755, + "flos": 528278298624.0, + "grad_norm": 0.07985404208202107, + "language_loss": 0.80787814, + "learning_rate": 0.00030930608365859407, + "loss": 0.8187831, + "num_input_tokens_seen": 275140512, + "router_z_loss_mlp": 0.15515137, + "step": 3305, + "time_per_iteration": 2.7090413570404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087839, + "balance_loss_mlp": 1.07174611, + "epoch": 0.6360138514813389, + "flos": 516811249152.0, + "grad_norm": 0.731689338993936, + "language_loss": 0.87885678, + "learning_rate": 0.00030901812627117943, + "loss": 0.88973522, + "num_input_tokens_seen": 275210896, + "router_z_loss_mlp": 0.16088867, + "step": 3306, + "time_per_iteration": 2.6327977180480957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077096, + "balance_loss_mlp": 1.06109858, + "epoch": 0.6362062331666025, + "flos": 
466525163520.0, + "grad_norm": 0.09002939621512045, + "language_loss": 0.84808385, + "learning_rate": 0.000308730243029504, + "loss": 0.85885489, + "num_input_tokens_seen": 275279888, + "router_z_loss_mlp": 0.15979004, + "step": 3307, + "time_per_iteration": 2.6054556369781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088589, + "balance_loss_mlp": 1.07207811, + "epoch": 0.6363986148518661, + "flos": 549720193536.0, + "grad_norm": 0.0753497997145879, + "language_loss": 0.79653525, + "learning_rate": 0.0003084424340453339, + "loss": 0.80742109, + "num_input_tokens_seen": 275357056, + "router_z_loss_mlp": 0.16516113, + "step": 3308, + "time_per_iteration": 2.8042142391204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095017, + "balance_loss_mlp": 1.0775888, + "epoch": 0.6365909965371297, + "flos": 583049083392.0, + "grad_norm": 0.08328342026231418, + "language_loss": 0.82059419, + "learning_rate": 0.0003081546994304064, + "loss": 0.8315444, + "num_input_tokens_seen": 275428240, + "router_z_loss_mlp": 0.17443848, + "step": 3309, + "time_per_iteration": 2.7940802574157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100113, + "balance_loss_mlp": 1.08294737, + "epoch": 0.6367833782223933, + "flos": 531255739392.0, + "grad_norm": 0.07711723091328526, + "language_loss": 0.81634271, + "learning_rate": 0.0003078670392964298, + "loss": 0.82734382, + "num_input_tokens_seen": 275497568, + "router_z_loss_mlp": 0.17175293, + "step": 3310, + "time_per_iteration": 2.6288981437683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111543, + "balance_loss_mlp": 1.09684515, + "epoch": 0.6369757599076568, + "flos": 569506526208.0, + "grad_norm": 0.09648821040849707, + "language_loss": 0.83039993, + "learning_rate": 0.00030757945375508406, + "loss": 0.84155422, + "num_input_tokens_seen": 275569616, + "router_z_loss_mlp": 0.18591309, + "step": 3311, + "time_per_iteration": 2.680053472518921 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120459, + "balance_loss_mlp": 1.10194564, + "epoch": 0.6371681415929203, + "flos": 539957892096.0, + "grad_norm": 0.07648325408881881, + "language_loss": 0.81110901, + "learning_rate": 0.00030729194291801944, + "loss": 0.82231361, + "num_input_tokens_seen": 275641408, + "router_z_loss_mlp": 0.18518066, + "step": 3312, + "time_per_iteration": 2.7345173358917236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124831, + "balance_loss_mlp": 1.10598445, + "epoch": 0.6373605232781839, + "flos": 483566423040.0, + "grad_norm": 0.1187576427749129, + "language_loss": 0.76967251, + "learning_rate": 0.00030700450689685787, + "loss": 0.78092086, + "num_input_tokens_seen": 275706608, + "router_z_loss_mlp": 0.18847656, + "step": 3313, + "time_per_iteration": 2.5925910472869873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134943, + "balance_loss_mlp": 1.11620378, + "epoch": 0.6375529049634475, + "flos": 578581636608.0, + "grad_norm": 0.086714433395562, + "language_loss": 0.85812229, + "learning_rate": 0.00030671714580319186, + "loss": 0.86947167, + "num_input_tokens_seen": 275785952, + "router_z_loss_mlp": 0.18762207, + "step": 3314, + "time_per_iteration": 2.8684160709381104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128682, + "balance_loss_mlp": 1.10954893, + "epoch": 0.637745286648711, + "flos": 682257530880.0, + "grad_norm": 0.07885995957457764, + "language_loss": 0.83140874, + "learning_rate": 0.0003064298597485846, + "loss": 0.84269553, + "num_input_tokens_seen": 275866240, + "router_z_loss_mlp": 0.19116211, + "step": 3315, + "time_per_iteration": 2.8987390995025635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122628, + "balance_loss_mlp": 1.10333991, + "epoch": 0.6379376683339746, + "flos": 504637558272.0, + "grad_norm": 0.08106722698037498, + "language_loss": 0.84028, + "learning_rate": 0.00030614264884457054, + "loss": 
0.85150629, + "num_input_tokens_seen": 275936176, + "router_z_loss_mlp": 0.19274902, + "step": 3316, + "time_per_iteration": 2.671858787536621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112572, + "balance_loss_mlp": 1.09383273, + "epoch": 0.6381300500192382, + "flos": 502020965376.0, + "grad_norm": 0.09520385776828669, + "language_loss": 0.77556765, + "learning_rate": 0.000305855513202655, + "loss": 0.78669333, + "num_input_tokens_seen": 276004608, + "router_z_loss_mlp": 0.18725586, + "step": 3317, + "time_per_iteration": 2.6103365421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105702, + "balance_loss_mlp": 1.08714104, + "epoch": 0.6383224317045018, + "flos": 400489961472.0, + "grad_norm": 0.0870793394439323, + "language_loss": 0.77407163, + "learning_rate": 0.0003055684529343138, + "loss": 0.78512859, + "num_input_tokens_seen": 276066688, + "router_z_loss_mlp": 0.18566895, + "step": 3318, + "time_per_iteration": 2.4441628456115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104453, + "balance_loss_mlp": 1.08614254, + "epoch": 0.6385148133897653, + "flos": 499377208320.0, + "grad_norm": 0.09431837628284816, + "language_loss": 0.78623343, + "learning_rate": 0.00030528146815099374, + "loss": 0.79727793, + "num_input_tokens_seen": 276140000, + "router_z_loss_mlp": 0.1829834, + "step": 3319, + "time_per_iteration": 2.6380391120910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092806, + "balance_loss_mlp": 1.07459044, + "epoch": 0.6387071950750288, + "flos": 527665632768.0, + "grad_norm": 0.0775286688862043, + "language_loss": 0.7192508, + "learning_rate": 0.00030499455896411203, + "loss": 0.73017889, + "num_input_tokens_seen": 276209840, + "router_z_loss_mlp": 0.18225098, + "step": 3320, + "time_per_iteration": 2.6337239742279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146548, + "balance_loss_mlp": 1.13748848, + "epoch": 0.6388995767602924, + 
"flos": 1455979069440.0, + "grad_norm": 0.05026445046140725, + "language_loss": 0.76300812, + "learning_rate": 0.0003047077254850568, + "loss": 0.77447361, + "num_input_tokens_seen": 276444784, + "router_z_loss_mlp": 0.09082031, + "step": 3321, + "time_per_iteration": 4.989959239959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080858, + "balance_loss_mlp": 1.06314373, + "epoch": 0.639091958445556, + "flos": 603895191552.0, + "grad_norm": 0.29371403446084504, + "language_loss": 0.76736987, + "learning_rate": 0.0003044209678251865, + "loss": 0.77817845, + "num_input_tokens_seen": 276522768, + "router_z_loss_mlp": 0.17712402, + "step": 3322, + "time_per_iteration": 2.9107608795166016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075524, + "balance_loss_mlp": 1.05879879, + "epoch": 0.6392843401308196, + "flos": 584516694528.0, + "grad_norm": 0.07557324535671889, + "language_loss": 0.84569478, + "learning_rate": 0.0003041342860958306, + "loss": 0.85645002, + "num_input_tokens_seen": 276597104, + "router_z_loss_mlp": 0.1673584, + "step": 3323, + "time_per_iteration": 2.7665860652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010742, + "balance_loss_mlp": 1.0572598, + "epoch": 0.6394767218160831, + "flos": 514681413120.0, + "grad_norm": 0.11260284844343603, + "language_loss": 0.9165262, + "learning_rate": 0.00030384768040828857, + "loss": 0.92726815, + "num_input_tokens_seen": 276670256, + "router_z_loss_mlp": 0.16931152, + "step": 3324, + "time_per_iteration": 2.6840200424194336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075186, + "balance_loss_mlp": 1.05894923, + "epoch": 0.6396691035013466, + "flos": 541732022784.0, + "grad_norm": 0.08385815306502278, + "language_loss": 0.85726339, + "learning_rate": 0.00030356115087383094, + "loss": 0.86801529, + "num_input_tokens_seen": 276737680, + "router_z_loss_mlp": 0.16235352, + "step": 3325, + "time_per_iteration": 2.685962200164795 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071194, + "balance_loss_mlp": 1.0543381, + "epoch": 0.6398614851866102, + "flos": 525535796736.0, + "grad_norm": 0.07882318349260847, + "language_loss": 0.85086048, + "learning_rate": 0.00030327469760369803, + "loss": 0.86157244, + "num_input_tokens_seen": 276803808, + "router_z_loss_mlp": 0.16870117, + "step": 3326, + "time_per_iteration": 2.5948264598846436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075003, + "balance_loss_mlp": 1.05855227, + "epoch": 0.6400538668718738, + "flos": 622989937152.0, + "grad_norm": 0.09362500195471922, + "language_loss": 0.84774464, + "learning_rate": 0.0003029883207091009, + "loss": 0.8584947, + "num_input_tokens_seen": 276874752, + "router_z_loss_mlp": 0.16455078, + "step": 3327, + "time_per_iteration": 2.7647178173065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080953, + "balance_loss_mlp": 1.06489587, + "epoch": 0.6402462485571374, + "flos": 503367436800.0, + "grad_norm": 0.0837002807607971, + "language_loss": 0.7833994, + "learning_rate": 0.00030270202030122095, + "loss": 0.794209, + "num_input_tokens_seen": 276947200, + "router_z_loss_mlp": 0.16052246, + "step": 3328, + "time_per_iteration": 2.6863620281219482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085596, + "balance_loss_mlp": 1.06944346, + "epoch": 0.6404386302424009, + "flos": 819247260672.0, + "grad_norm": 0.12091934143095992, + "language_loss": 0.86217034, + "learning_rate": 0.00030241579649121, + "loss": 0.87302625, + "num_input_tokens_seen": 277025712, + "router_z_loss_mlp": 0.16149902, + "step": 3329, + "time_per_iteration": 3.0689570903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094276, + "balance_loss_mlp": 1.07826567, + "epoch": 0.6406310119276645, + "flos": 471812677632.0, + "grad_norm": 0.07676724008110788, + "language_loss": 0.79411578, + "learning_rate": 0.00030212964939018994, + "loss": 
0.80505848, + "num_input_tokens_seen": 277091264, + "router_z_loss_mlp": 0.16003418, + "step": 3330, + "time_per_iteration": 2.619704484939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106355, + "balance_loss_mlp": 1.09061956, + "epoch": 0.6408233936129281, + "flos": 425583631872.0, + "grad_norm": 0.1228216310287833, + "language_loss": 0.85246855, + "learning_rate": 0.0003018435791092527, + "loss": 0.86353219, + "num_input_tokens_seen": 277154608, + "router_z_loss_mlp": 0.15722656, + "step": 3331, + "time_per_iteration": 2.5062122344970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111787, + "balance_loss_mlp": 1.09569383, + "epoch": 0.6410157752981916, + "flos": 549784433664.0, + "grad_norm": 0.09250977947547825, + "language_loss": 0.80749196, + "learning_rate": 0.00030155758575946083, + "loss": 0.81860983, + "num_input_tokens_seen": 277222176, + "router_z_loss_mlp": 0.16088867, + "step": 3332, + "time_per_iteration": 2.649428129196167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126368, + "balance_loss_mlp": 1.11058497, + "epoch": 0.6412081569834551, + "flos": 475899452928.0, + "grad_norm": 0.08516597392533326, + "language_loss": 0.83713603, + "learning_rate": 0.0003012716694518467, + "loss": 0.8483997, + "num_input_tokens_seen": 277289600, + "router_z_loss_mlp": 0.15771484, + "step": 3333, + "time_per_iteration": 2.6135807037353516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132718, + "balance_loss_mlp": 1.11691082, + "epoch": 0.6414005386687187, + "flos": 540921494016.0, + "grad_norm": 0.07646899423626412, + "language_loss": 0.85365057, + "learning_rate": 0.000300985830297413, + "loss": 0.86497772, + "num_input_tokens_seen": 277362784, + "router_z_loss_mlp": 0.15795898, + "step": 3334, + "time_per_iteration": 2.7174272537231445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139874, + "balance_loss_mlp": 1.12411475, + "epoch": 0.6415929203539823, + 
"flos": 1041317379072.0, + "grad_norm": 0.09796479304717164, + "language_loss": 0.87037742, + "learning_rate": 0.00030070006840713205, + "loss": 0.88177609, + "num_input_tokens_seen": 277449728, + "router_z_loss_mlp": 0.1574707, + "step": 3335, + "time_per_iteration": 3.4015066623687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143096, + "balance_loss_mlp": 1.12753963, + "epoch": 0.6417853020392459, + "flos": 648337996800.0, + "grad_norm": 0.08601362960013602, + "language_loss": 0.73407865, + "learning_rate": 0.000300414383891947, + "loss": 0.74550962, + "num_input_tokens_seen": 277527552, + "router_z_loss_mlp": 0.15539551, + "step": 3336, + "time_per_iteration": 2.889427661895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147858, + "balance_loss_mlp": 1.13256359, + "epoch": 0.6419776837245095, + "flos": 500899147776.0, + "grad_norm": 0.06457734874365277, + "language_loss": 0.88771486, + "learning_rate": 0.00030012877686276973, + "loss": 0.89919341, + "num_input_tokens_seen": 277603568, + "router_z_loss_mlp": 0.15270996, + "step": 3337, + "time_per_iteration": 2.7751049995422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149838, + "balance_loss_mlp": 1.13451934, + "epoch": 0.642170065409773, + "flos": 620620392960.0, + "grad_norm": 0.07413872787438813, + "language_loss": 0.86947334, + "learning_rate": 0.0002998432474304832, + "loss": 0.88097167, + "num_input_tokens_seen": 277679696, + "router_z_loss_mlp": 0.1529541, + "step": 3338, + "time_per_iteration": 2.8443615436553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066826, + "balance_loss_mlp": 1.05876791, + "epoch": 0.6423624470950365, + "flos": 1423539629568.0, + "grad_norm": 0.0235298997703447, + "language_loss": 0.79237342, + "learning_rate": 0.0002995577957059395, + "loss": 0.8030417, + "num_input_tokens_seen": 277913056, + "router_z_loss_mlp": 0.08056641, + "step": 3339, + "time_per_iteration": 4.873133659362793 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146518, + "balance_loss_mlp": 1.13105643, + "epoch": 0.6425548287803001, + "flos": 562353477120.0, + "grad_norm": 0.06001199999117321, + "language_loss": 0.88487816, + "learning_rate": 0.00029927242179996107, + "loss": 0.89634329, + "num_input_tokens_seen": 277983168, + "router_z_loss_mlp": 0.15441895, + "step": 3340, + "time_per_iteration": 2.7014224529266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144782, + "balance_loss_mlp": 1.12893939, + "epoch": 0.6427472104655637, + "flos": 585443220480.0, + "grad_norm": 0.0682782247360197, + "language_loss": 0.83006454, + "learning_rate": 0.0002989871258233398, + "loss": 0.84151232, + "num_input_tokens_seen": 278057600, + "router_z_loss_mlp": 0.1583252, + "step": 3341, + "time_per_iteration": 2.7891581058502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137179, + "balance_loss_mlp": 1.12174153, + "epoch": 0.6429395921508272, + "flos": 404282700288.0, + "grad_norm": 0.10902678914385976, + "language_loss": 0.82279134, + "learning_rate": 0.0002987019078868373, + "loss": 0.83416307, + "num_input_tokens_seen": 278119232, + "router_z_loss_mlp": 0.15429688, + "step": 3342, + "time_per_iteration": 2.4937355518341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135998, + "balance_loss_mlp": 1.12050128, + "epoch": 0.6431319738360908, + "flos": 548783755776.0, + "grad_norm": 0.07048504684512738, + "language_loss": 0.81617045, + "learning_rate": 0.00029841676810118484, + "loss": 0.8275305, + "num_input_tokens_seen": 278187456, + "router_z_loss_mlp": 0.15478516, + "step": 3343, + "time_per_iteration": 2.721240997314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011253, + "balance_loss_mlp": 1.10925424, + "epoch": 0.6433243555213544, + "flos": 793375368192.0, + "grad_norm": 0.08414428374798259, + "language_loss": 0.87345612, + "learning_rate": 0.0002981317065770839, + "loss": 
0.88470906, + "num_input_tokens_seen": 278262176, + "router_z_loss_mlp": 0.16040039, + "step": 3344, + "time_per_iteration": 3.06880521774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120449, + "balance_loss_mlp": 1.10509467, + "epoch": 0.643516737206618, + "flos": 583031831040.0, + "grad_norm": 0.10046839688715496, + "language_loss": 0.80932879, + "learning_rate": 0.00029784672342520493, + "loss": 0.82053328, + "num_input_tokens_seen": 278328816, + "router_z_loss_mlp": 0.15332031, + "step": 3345, + "time_per_iteration": 2.71240496635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118949, + "balance_loss_mlp": 1.10347569, + "epoch": 0.6437091188918815, + "flos": 518750936064.0, + "grad_norm": 0.10277118119313504, + "language_loss": 0.8364169, + "learning_rate": 0.00029756181875618834, + "loss": 0.84760636, + "num_input_tokens_seen": 278395824, + "router_z_loss_mlp": 0.15454102, + "step": 3346, + "time_per_iteration": 2.589006185531616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110068, + "balance_loss_mlp": 1.09397459, + "epoch": 0.643901500577145, + "flos": 384946048512.0, + "grad_norm": 0.07329616069241408, + "language_loss": 0.83350551, + "learning_rate": 0.0002972769926806439, + "loss": 0.84460616, + "num_input_tokens_seen": 278457696, + "router_z_loss_mlp": 0.16088867, + "step": 3347, + "time_per_iteration": 2.4795889854431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102586, + "balance_loss_mlp": 1.08667207, + "epoch": 0.6440938822624086, + "flos": 483722067456.0, + "grad_norm": 0.08705096327396913, + "language_loss": 0.88483214, + "learning_rate": 0.0002969922453091508, + "loss": 0.89585805, + "num_input_tokens_seen": 278526992, + "router_z_loss_mlp": 0.15905762, + "step": 3348, + "time_per_iteration": 2.6210718154907227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097529, + "balance_loss_mlp": 1.08181691, + "epoch": 0.6442862639476722, + 
"flos": 540469241856.0, + "grad_norm": 0.07238968090478194, + "language_loss": 0.85106307, + "learning_rate": 0.00029670757675225777, + "loss": 0.86203837, + "num_input_tokens_seen": 278601120, + "router_z_loss_mlp": 0.15698242, + "step": 3349, + "time_per_iteration": 2.7721433639526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110462, + "balance_loss_mlp": 1.08850336, + "epoch": 0.6444786456329358, + "flos": 526912003584.0, + "grad_norm": 0.07515890513129632, + "language_loss": 0.79165089, + "learning_rate": 0.0002964229871204831, + "loss": 0.80269712, + "num_input_tokens_seen": 278668208, + "router_z_loss_mlp": 0.16113281, + "step": 3350, + "time_per_iteration": 2.6707816123962402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095562, + "balance_loss_mlp": 1.0801599, + "epoch": 0.6446710273181993, + "flos": 697892848128.0, + "grad_norm": 0.08798444553042223, + "language_loss": 0.83359706, + "learning_rate": 0.00029613847652431403, + "loss": 0.84455276, + "num_input_tokens_seen": 278742832, + "router_z_loss_mlp": 0.15380859, + "step": 3351, + "time_per_iteration": 3.076526403427124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097548, + "balance_loss_mlp": 1.08212233, + "epoch": 0.6448634090034628, + "flos": 625023226368.0, + "grad_norm": 0.07638162033625162, + "language_loss": 0.79389453, + "learning_rate": 0.0002958540450742078, + "loss": 0.80487001, + "num_input_tokens_seen": 278829744, + "router_z_loss_mlp": 0.15405273, + "step": 3352, + "time_per_iteration": 2.9522883892059326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088563, + "balance_loss_mlp": 1.0726012, + "epoch": 0.6450557906887264, + "flos": 600950057472.0, + "grad_norm": 0.0756305542343249, + "language_loss": 0.77240932, + "learning_rate": 0.0002955696928805901, + "loss": 0.78329492, + "num_input_tokens_seen": 278908592, + "router_z_loss_mlp": 0.1595459, + "step": 3353, + "time_per_iteration": 2.9094340801239014 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085355, + "balance_loss_mlp": 1.06963158, + "epoch": 0.64524817237399, + "flos": 646200820224.0, + "grad_norm": 0.06844554390728431, + "language_loss": 0.85999632, + "learning_rate": 0.0002952854200538563, + "loss": 0.87084985, + "num_input_tokens_seen": 278986960, + "router_z_loss_mlp": 0.15710449, + "step": 3354, + "time_per_iteration": 2.820434808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084907, + "balance_loss_mlp": 1.06869507, + "epoch": 0.6454405540592536, + "flos": 473411340288.0, + "grad_norm": 0.06913920875514136, + "language_loss": 0.81889141, + "learning_rate": 0.000295001226704371, + "loss": 0.82974052, + "num_input_tokens_seen": 279054896, + "router_z_loss_mlp": 0.16210938, + "step": 3355, + "time_per_iteration": 2.5986335277557373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108489, + "balance_loss_mlp": 1.06890416, + "epoch": 0.6456329357445171, + "flos": 611841517056.0, + "grad_norm": 0.09885460373784571, + "language_loss": 0.82614869, + "learning_rate": 0.00029471711294246783, + "loss": 0.83699757, + "num_input_tokens_seen": 279126816, + "router_z_loss_mlp": 0.15979004, + "step": 3356, + "time_per_iteration": 2.7818820476531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108141, + "balance_loss_mlp": 1.06516171, + "epoch": 0.6458253174297807, + "flos": 731683901952.0, + "grad_norm": 0.08380792683960937, + "language_loss": 0.8265574, + "learning_rate": 0.0002944330788784494, + "loss": 0.83737159, + "num_input_tokens_seen": 279197552, + "router_z_loss_mlp": 0.16247559, + "step": 3357, + "time_per_iteration": 2.8812832832336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_mlp": 1.07064033, + "epoch": 0.6460176991150443, + "flos": 570413228544.0, + "grad_norm": 0.06446449543464593, + "language_loss": 0.84539986, + "learning_rate": 0.00029414912462258786, + "loss": 
0.85626698, + "num_input_tokens_seen": 279275440, + "router_z_loss_mlp": 0.16064453, + "step": 3358, + "time_per_iteration": 2.8379344940185547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083974, + "balance_loss_mlp": 1.06774938, + "epoch": 0.6462100808003078, + "flos": 583160311296.0, + "grad_norm": 0.10894531444505327, + "language_loss": 0.81342053, + "learning_rate": 0.00029386525028512366, + "loss": 0.82426023, + "num_input_tokens_seen": 279349168, + "router_z_loss_mlp": 0.16223145, + "step": 3359, + "time_per_iteration": 2.7395105361938477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085698, + "balance_loss_mlp": 1.06923473, + "epoch": 0.6464024624855714, + "flos": 483919557120.0, + "grad_norm": 0.0747784188423731, + "language_loss": 0.87245089, + "learning_rate": 0.0002935814559762666, + "loss": 0.88330787, + "num_input_tokens_seen": 279427600, + "router_z_loss_mlp": 0.16467285, + "step": 3360, + "time_per_iteration": 2.784308433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085107, + "balance_loss_mlp": 1.0693121, + "epoch": 0.6465948441708349, + "flos": 527774289408.0, + "grad_norm": 0.08701816343454002, + "language_loss": 0.79713386, + "learning_rate": 0.0002932977418061957, + "loss": 0.80798495, + "num_input_tokens_seen": 279496608, + "router_z_loss_mlp": 0.15783691, + "step": 3361, + "time_per_iteration": 2.6461353302001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085334, + "balance_loss_mlp": 1.06910968, + "epoch": 0.6467872258560985, + "flos": 669421615104.0, + "grad_norm": 0.07872387718788462, + "language_loss": 0.80426037, + "learning_rate": 0.00029301410788505833, + "loss": 0.81511372, + "num_input_tokens_seen": 279568448, + "router_z_loss_mlp": 0.16223145, + "step": 3362, + "time_per_iteration": 2.813366413116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092118, + "balance_loss_mlp": 1.07586968, + "epoch": 0.6469796075413621, + 
"flos": 432101620224.0, + "grad_norm": 0.12115890254609105, + "language_loss": 0.80987, + "learning_rate": 0.00029273055432297126, + "loss": 0.82079118, + "num_input_tokens_seen": 279631952, + "router_z_loss_mlp": 0.16247559, + "step": 3363, + "time_per_iteration": 2.511802911758423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084973, + "balance_loss_mlp": 1.06877291, + "epoch": 0.6471719892266257, + "flos": 803750335488.0, + "grad_norm": 0.06785413564758717, + "language_loss": 0.80699545, + "learning_rate": 0.00029244708123001917, + "loss": 0.81784511, + "num_input_tokens_seen": 279706880, + "router_z_loss_mlp": 0.1619873, + "step": 3364, + "time_per_iteration": 2.980236768722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082898, + "balance_loss_mlp": 1.06629229, + "epoch": 0.6473643709118891, + "flos": 577208001024.0, + "grad_norm": 0.06532727643194694, + "language_loss": 0.84224701, + "learning_rate": 0.0002921636887162565, + "loss": 0.85307598, + "num_input_tokens_seen": 279778864, + "router_z_loss_mlp": 0.16601562, + "step": 3365, + "time_per_iteration": 2.7428975105285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108053, + "balance_loss_mlp": 1.06452012, + "epoch": 0.6475567525971527, + "flos": 761420113920.0, + "grad_norm": 0.08750639316887465, + "language_loss": 0.83789468, + "learning_rate": 0.00029188037689170595, + "loss": 0.84869999, + "num_input_tokens_seen": 279853328, + "router_z_loss_mlp": 0.16015625, + "step": 3366, + "time_per_iteration": 2.982468843460083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088124, + "balance_loss_mlp": 1.07188749, + "epoch": 0.6477491342824163, + "flos": 843103116288.0, + "grad_norm": 0.08320193345664485, + "language_loss": 0.84052682, + "learning_rate": 0.0002915971458663586, + "loss": 0.85140812, + "num_input_tokens_seen": 279928464, + "router_z_loss_mlp": 0.16235352, + "step": 3367, + "time_per_iteration": 3.065324544906616 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083268, + "balance_loss_mlp": 1.06736565, + "epoch": 0.6479415159676799, + "flos": 884820298752.0, + "grad_norm": 0.05677832621363699, + "language_loss": 0.81828123, + "learning_rate": 0.00029131399575017494, + "loss": 0.82911396, + "num_input_tokens_seen": 280015680, + "router_z_loss_mlp": 0.15893555, + "step": 3368, + "time_per_iteration": 3.181908130645752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079967, + "balance_loss_mlp": 1.06356418, + "epoch": 0.6481338976529435, + "flos": 615513116160.0, + "grad_norm": 0.06942657132452239, + "language_loss": 0.85825574, + "learning_rate": 0.0002910309266530836, + "loss": 0.86905545, + "num_input_tokens_seen": 280093904, + "router_z_loss_mlp": 0.16394043, + "step": 3369, + "time_per_iteration": 2.853720188140869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088509, + "balance_loss_mlp": 1.07184362, + "epoch": 0.648326279338207, + "flos": 510009136128.0, + "grad_norm": 0.07294925801459864, + "language_loss": 0.85398757, + "learning_rate": 0.0002907479386849814, + "loss": 0.86487263, + "num_input_tokens_seen": 280161584, + "router_z_loss_mlp": 0.16674805, + "step": 3370, + "time_per_iteration": 2.6536483764648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087231, + "balance_loss_mlp": 1.07115006, + "epoch": 0.6485186610234706, + "flos": 702498313728.0, + "grad_norm": 0.07555767339511835, + "language_loss": 0.80052334, + "learning_rate": 0.0002904650319557339, + "loss": 0.81139565, + "num_input_tokens_seen": 280248016, + "router_z_loss_mlp": 0.1607666, + "step": 3371, + "time_per_iteration": 2.996121644973755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109038, + "balance_loss_mlp": 1.07432294, + "epoch": 0.6487110427087341, + "flos": 560683233792.0, + "grad_norm": 0.08951276836264582, + "language_loss": 0.80922645, + "learning_rate": 0.0002901822065751758, + "loss": 
0.82013029, + "num_input_tokens_seen": 280319024, + "router_z_loss_mlp": 0.16052246, + "step": 3372, + "time_per_iteration": 2.6797337532043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091281, + "balance_loss_mlp": 1.07499671, + "epoch": 0.6489034243939977, + "flos": 680100530688.0, + "grad_norm": 0.07760680583189275, + "language_loss": 0.85333431, + "learning_rate": 0.0002898994626531093, + "loss": 0.86424708, + "num_input_tokens_seen": 280393200, + "router_z_loss_mlp": 0.1628418, + "step": 3373, + "time_per_iteration": 2.9142796993255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096791, + "balance_loss_mlp": 1.08061421, + "epoch": 0.6490958060792612, + "flos": 474412018176.0, + "grad_norm": 0.08009556934664804, + "language_loss": 0.87685299, + "learning_rate": 0.00028961680029930526, + "loss": 0.88782084, + "num_input_tokens_seen": 280456944, + "router_z_loss_mlp": 0.16174316, + "step": 3374, + "time_per_iteration": 2.6317813396453857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093542, + "balance_loss_mlp": 1.07774687, + "epoch": 0.6492881877645248, + "flos": 588850518528.0, + "grad_norm": 0.10403413737610764, + "language_loss": 0.76691556, + "learning_rate": 0.00028933421962350317, + "loss": 0.77785093, + "num_input_tokens_seen": 280534352, + "router_z_loss_mlp": 0.15783691, + "step": 3375, + "time_per_iteration": 2.74595046043396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100726, + "balance_loss_mlp": 1.08450198, + "epoch": 0.6494805694497884, + "flos": 642427905024.0, + "grad_norm": 0.07424311731370936, + "language_loss": 0.83669561, + "learning_rate": 0.0002890517207354104, + "loss": 0.84770286, + "num_input_tokens_seen": 280608912, + "router_z_loss_mlp": 0.16223145, + "step": 3376, + "time_per_iteration": 2.8577523231506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099818, + "balance_loss_mlp": 1.08361781, + "epoch": 0.649672951135052, + 
"flos": 531806736384.0, + "grad_norm": 0.06432673678328359, + "language_loss": 0.81672311, + "learning_rate": 0.0002887693037447029, + "loss": 0.8277213, + "num_input_tokens_seen": 280678848, + "router_z_loss_mlp": 0.1619873, + "step": 3377, + "time_per_iteration": 2.6683990955352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105741, + "balance_loss_mlp": 1.08960009, + "epoch": 0.6498653328203156, + "flos": 547387725312.0, + "grad_norm": 0.07747237363715123, + "language_loss": 0.81861436, + "learning_rate": 0.00028848696876102443, + "loss": 0.8296718, + "num_input_tokens_seen": 280750224, + "router_z_loss_mlp": 0.16137695, + "step": 3378, + "time_per_iteration": 2.6596877574920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104851, + "balance_loss_mlp": 1.08822083, + "epoch": 0.650057714505579, + "flos": 462228415488.0, + "grad_norm": 0.07839901115020462, + "language_loss": 0.83414477, + "learning_rate": 0.00028820471589398723, + "loss": 0.84519327, + "num_input_tokens_seen": 280817488, + "router_z_loss_mlp": 0.16638184, + "step": 3379, + "time_per_iteration": 2.5814483165740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097332, + "balance_loss_mlp": 1.08126247, + "epoch": 0.6502500961908426, + "flos": 510172121088.0, + "grad_norm": 0.08703202670107367, + "language_loss": 0.77904689, + "learning_rate": 0.00028792254525317196, + "loss": 0.79002023, + "num_input_tokens_seen": 280887440, + "router_z_loss_mlp": 0.16064453, + "step": 3380, + "time_per_iteration": 2.72318434715271 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097321, + "balance_loss_mlp": 1.08113241, + "epoch": 0.6504424778761062, + "flos": 579827165184.0, + "grad_norm": 0.1026330165039415, + "language_loss": 0.81341857, + "learning_rate": 0.00028764045694812645, + "loss": 0.82439172, + "num_input_tokens_seen": 280959072, + "router_z_loss_mlp": 0.16186523, + "step": 3381, + "time_per_iteration": 2.8238747119903564 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094282, + "balance_loss_mlp": 1.07821298, + "epoch": 0.6506348595613698, + "flos": 519457577472.0, + "grad_norm": 0.08728487629986259, + "language_loss": 0.76526588, + "learning_rate": 0.0002873584510883671, + "loss": 0.7762087, + "num_input_tokens_seen": 281025376, + "router_z_loss_mlp": 0.16064453, + "step": 3382, + "time_per_iteration": 2.577808380126953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093616, + "balance_loss_mlp": 1.07766557, + "epoch": 0.6508272412466333, + "flos": 510310513152.0, + "grad_norm": 0.0816081367249066, + "language_loss": 0.8610574, + "learning_rate": 0.0002870765277833788, + "loss": 0.87199354, + "num_input_tokens_seen": 281097616, + "router_z_loss_mlp": 0.15942383, + "step": 3383, + "time_per_iteration": 2.694575071334839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084046, + "balance_loss_mlp": 1.06782174, + "epoch": 0.6510196229318969, + "flos": 625623782400.0, + "grad_norm": 0.07010273860249229, + "language_loss": 0.80279285, + "learning_rate": 0.00028679468714261347, + "loss": 0.81363332, + "num_input_tokens_seen": 281170192, + "router_z_loss_mlp": 0.16210938, + "step": 3384, + "time_per_iteration": 2.778613805770874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108697, + "balance_loss_mlp": 1.07105613, + "epoch": 0.6512120046171604, + "flos": 474696142848.0, + "grad_norm": 0.08179084803360179, + "language_loss": 0.76861918, + "learning_rate": 0.0002865129292754918, + "loss": 0.77948892, + "num_input_tokens_seen": 281238832, + "router_z_loss_mlp": 0.15905762, + "step": 3385, + "time_per_iteration": 2.579616069793701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089266, + "balance_loss_mlp": 1.07348299, + "epoch": 0.651404386302424, + "flos": 551854798848.0, + "grad_norm": 0.07682712514001584, + "language_loss": 0.81799757, + "learning_rate": 0.00028623125429140105, + "loss": 
0.82889026, + "num_input_tokens_seen": 281319472, + "router_z_loss_mlp": 0.15771484, + "step": 3386, + "time_per_iteration": 2.867527961730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088264, + "balance_loss_mlp": 1.07212317, + "epoch": 0.6515967679876876, + "flos": 523311985152.0, + "grad_norm": 0.08394692910439203, + "language_loss": 0.86795825, + "learning_rate": 0.00028594966229969785, + "loss": 0.8788408, + "num_input_tokens_seen": 281391168, + "router_z_loss_mlp": 0.16137695, + "step": 3387, + "time_per_iteration": 2.729546546936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090562, + "balance_loss_mlp": 1.07511222, + "epoch": 0.6517891496729511, + "flos": 573874854912.0, + "grad_norm": 0.06943959400657512, + "language_loss": 0.80838251, + "learning_rate": 0.00028566815340970577, + "loss": 0.81928813, + "num_input_tokens_seen": 281465664, + "router_z_loss_mlp": 0.15429688, + "step": 3388, + "time_per_iteration": 2.7792935371398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087811, + "balance_loss_mlp": 1.07234919, + "epoch": 0.6519815313582147, + "flos": 555926893056.0, + "grad_norm": 0.10254381492553745, + "language_loss": 0.80897045, + "learning_rate": 0.0002853867277307162, + "loss": 0.81984854, + "num_input_tokens_seen": 281532928, + "router_z_loss_mlp": 0.15441895, + "step": 3389, + "time_per_iteration": 2.6666784286499023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087817, + "balance_loss_mlp": 1.07184315, + "epoch": 0.6521739130434783, + "flos": 480487666176.0, + "grad_norm": 0.06532744179053884, + "language_loss": 0.82385129, + "learning_rate": 0.00028510538537198824, + "loss": 0.83472943, + "num_input_tokens_seen": 281601680, + "router_z_loss_mlp": 0.15966797, + "step": 3390, + "time_per_iteration": 2.6052095890045166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088167, + "balance_loss_mlp": 1.0724194, + "epoch": 0.6523662947287419, 
+ "flos": 665707797504.0, + "grad_norm": 0.0671667027418021, + "language_loss": 0.86489713, + "learning_rate": 0.00028482412644274867, + "loss": 0.87577885, + "num_input_tokens_seen": 281679488, + "router_z_loss_mlp": 0.15734863, + "step": 3391, + "time_per_iteration": 2.987199068069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086228, + "balance_loss_mlp": 1.07036138, + "epoch": 0.6525586764140053, + "flos": 548655275520.0, + "grad_norm": 0.07499708088395778, + "language_loss": 0.74256724, + "learning_rate": 0.00028454295105219207, + "loss": 0.75342953, + "num_input_tokens_seen": 281751056, + "router_z_loss_mlp": 0.15856934, + "step": 3392, + "time_per_iteration": 2.6695480346679688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087072, + "balance_loss_mlp": 1.07157481, + "epoch": 0.6527510580992689, + "flos": 802900159488.0, + "grad_norm": 0.1096122029858208, + "language_loss": 0.79101622, + "learning_rate": 0.0002842618593094802, + "loss": 0.80188692, + "num_input_tokens_seen": 281841008, + "router_z_loss_mlp": 0.15478516, + "step": 3393, + "time_per_iteration": 3.142174005508423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091326, + "balance_loss_mlp": 1.07560241, + "epoch": 0.6529434397845325, + "flos": 671166010368.0, + "grad_norm": 0.19909824344708926, + "language_loss": 0.80162621, + "learning_rate": 0.00028398085132374243, + "loss": 0.81253946, + "num_input_tokens_seen": 281908016, + "router_z_loss_mlp": 0.15710449, + "step": 3394, + "time_per_iteration": 2.806171178817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088691, + "balance_loss_mlp": 1.0730865, + "epoch": 0.6531358214697961, + "flos": 828409006080.0, + "grad_norm": 0.0722395804995529, + "language_loss": 0.84118348, + "learning_rate": 0.0002836999272040761, + "loss": 0.85207039, + "num_input_tokens_seen": 281989072, + "router_z_loss_mlp": 0.15588379, + "step": 3395, + "time_per_iteration": 3.128824472427368 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084488, + "balance_loss_mlp": 1.06822813, + "epoch": 0.6533282031550597, + "flos": 487403578368.0, + "grad_norm": 0.09318824389102451, + "language_loss": 0.84164572, + "learning_rate": 0.00028341908705954575, + "loss": 0.85249066, + "num_input_tokens_seen": 282053152, + "router_z_loss_mlp": 0.16259766, + "step": 3396, + "time_per_iteration": 2.557788848876953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_mlp": 1.02878892, + "epoch": 0.6535205848403232, + "flos": 1557744638976.0, + "grad_norm": 0.02499747556734328, + "language_loss": 0.81761813, + "learning_rate": 0.00028313833099918265, + "loss": 0.82797897, + "num_input_tokens_seen": 282283984, + "router_z_loss_mlp": 0.07275391, + "step": 3397, + "time_per_iteration": 4.86290979385376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095714, + "balance_loss_mlp": 1.07970405, + "epoch": 0.6537129665255867, + "flos": 493711593984.0, + "grad_norm": 0.0751394003392184, + "language_loss": 0.78380162, + "learning_rate": 0.00028285765913198604, + "loss": 0.7947588, + "num_input_tokens_seen": 282353008, + "router_z_loss_mlp": 0.16003418, + "step": 3398, + "time_per_iteration": 2.600771903991699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098784, + "balance_loss_mlp": 1.08344197, + "epoch": 0.6539053482108503, + "flos": 605002328064.0, + "grad_norm": 0.09108234691861208, + "language_loss": 0.81861216, + "learning_rate": 0.0002825770715669227, + "loss": 0.82960004, + "num_input_tokens_seen": 282427648, + "router_z_loss_mlp": 0.15319824, + "step": 3399, + "time_per_iteration": 2.737407684326172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109606, + "balance_loss_mlp": 1.08050275, + "epoch": 0.6540977298961139, + "flos": 577778821632.0, + "grad_norm": 0.06648810793188806, + "language_loss": 0.81205964, + "learning_rate": 0.00028229656841292634, + "loss": 
0.82302022, + "num_input_tokens_seen": 282502128, + "router_z_loss_mlp": 0.15539551, + "step": 3400, + "time_per_iteration": 2.772174596786499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094687, + "balance_loss_mlp": 1.07884359, + "epoch": 0.6542901115813774, + "flos": 511753531392.0, + "grad_norm": 0.09125126415634116, + "language_loss": 0.76617396, + "learning_rate": 0.0002820161497788979, + "loss": 0.77712083, + "num_input_tokens_seen": 282569360, + "router_z_loss_mlp": 0.1583252, + "step": 3401, + "time_per_iteration": 2.5956950187683105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097852, + "balance_loss_mlp": 1.08250988, + "epoch": 0.654482493266641, + "flos": 625495302144.0, + "grad_norm": 0.07069704571698167, + "language_loss": 0.8703959, + "learning_rate": 0.00028173581577370545, + "loss": 0.88137436, + "num_input_tokens_seen": 282645472, + "router_z_loss_mlp": 0.15332031, + "step": 3402, + "time_per_iteration": 2.781916379928589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092911, + "balance_loss_mlp": 1.0775454, + "epoch": 0.6546748749519046, + "flos": 523981550592.0, + "grad_norm": 0.07155783234784782, + "language_loss": 0.7877273, + "learning_rate": 0.0002814555665061844, + "loss": 0.7986564, + "num_input_tokens_seen": 282717568, + "router_z_loss_mlp": 0.15356445, + "step": 3403, + "time_per_iteration": 2.6751809120178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097131, + "balance_loss_mlp": 1.08174062, + "epoch": 0.6548672566371682, + "flos": 479210204160.0, + "grad_norm": 0.0786668286900623, + "language_loss": 0.77486473, + "learning_rate": 0.00028117540208513715, + "loss": 0.78583604, + "num_input_tokens_seen": 282791408, + "router_z_loss_mlp": 0.15368652, + "step": 3404, + "time_per_iteration": 2.749115228652954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102022, + "balance_loss_mlp": 1.08668029, + "epoch": 0.6550596383224317, + 
"flos": 616012356096.0, + "grad_norm": 0.08182139934460984, + "language_loss": 0.84582227, + "learning_rate": 0.00028089532261933313, + "loss": 0.85684246, + "num_input_tokens_seen": 282862992, + "router_z_loss_mlp": 0.15319824, + "step": 3405, + "time_per_iteration": 2.7621490955352783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103666, + "balance_loss_mlp": 1.08819294, + "epoch": 0.6552520200076952, + "flos": 488836684800.0, + "grad_norm": 0.09214410473425906, + "language_loss": 0.85497427, + "learning_rate": 0.0002806153282175087, + "loss": 0.8660109, + "num_input_tokens_seen": 282930448, + "router_z_loss_mlp": 0.15454102, + "step": 3406, + "time_per_iteration": 2.554258346557617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106093, + "balance_loss_mlp": 1.09101284, + "epoch": 0.6554444016929588, + "flos": 687619196928.0, + "grad_norm": 0.08390483637961621, + "language_loss": 0.8305704, + "learning_rate": 0.0002803354189883679, + "loss": 0.84163129, + "num_input_tokens_seen": 283010864, + "router_z_loss_mlp": 0.15063477, + "step": 3407, + "time_per_iteration": 2.8450276851654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106209, + "balance_loss_mlp": 1.09085512, + "epoch": 0.6556367833782224, + "flos": 543051330048.0, + "grad_norm": 0.0734148655428184, + "language_loss": 0.85336381, + "learning_rate": 0.00028005559504058053, + "loss": 0.8644259, + "num_input_tokens_seen": 283082240, + "router_z_loss_mlp": 0.15332031, + "step": 3408, + "time_per_iteration": 2.7837629318237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093577, + "balance_loss_mlp": 1.07772207, + "epoch": 0.655829165063486, + "flos": 673535554560.0, + "grad_norm": 0.16856049180871682, + "language_loss": 0.76636934, + "learning_rate": 0.0002797758564827838, + "loss": 0.77730507, + "num_input_tokens_seen": 283156656, + "router_z_loss_mlp": 0.15844727, + "step": 3409, + "time_per_iteration": 2.822136402130127 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110799, + "balance_loss_mlp": 1.09300518, + "epoch": 0.6560215467487496, + "flos": 531806736384.0, + "grad_norm": 0.07117651183978699, + "language_loss": 0.83604807, + "learning_rate": 0.0002794962034235824, + "loss": 0.84712797, + "num_input_tokens_seen": 283223584, + "router_z_loss_mlp": 0.14953613, + "step": 3410, + "time_per_iteration": 2.66369891166687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099109, + "balance_loss_mlp": 1.08368373, + "epoch": 0.656213928434013, + "flos": 591311467008.0, + "grad_norm": 0.08491575127414114, + "language_loss": 0.7504462, + "learning_rate": 0.00027921663597154695, + "loss": 0.7614373, + "num_input_tokens_seen": 283297680, + "router_z_loss_mlp": 0.15405273, + "step": 3411, + "time_per_iteration": 2.7551727294921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092638, + "balance_loss_mlp": 1.07776022, + "epoch": 0.6564063101192766, + "flos": 415786825728.0, + "grad_norm": 0.11157782453846309, + "language_loss": 0.8107205, + "learning_rate": 0.00027893715423521525, + "loss": 0.82164693, + "num_input_tokens_seen": 283359744, + "router_z_loss_mlp": 0.14868164, + "step": 3412, + "time_per_iteration": 2.472046375274658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091359, + "balance_loss_mlp": 1.07526577, + "epoch": 0.6565986918045402, + "flos": 453321059328.0, + "grad_norm": 0.09147019628997788, + "language_loss": 0.83780456, + "learning_rate": 0.00027865775832309163, + "loss": 0.84871817, + "num_input_tokens_seen": 283430688, + "router_z_loss_mlp": 0.16088867, + "step": 3413, + "time_per_iteration": 2.66375470161438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095598, + "balance_loss_mlp": 1.08004141, + "epoch": 0.6567910734898038, + "flos": 547746001920.0, + "grad_norm": 0.06942494877025387, + "language_loss": 0.86058021, + "learning_rate": 0.00027837844834364733, + "loss": 
0.8715362, + "num_input_tokens_seen": 283498048, + "router_z_loss_mlp": 0.15539551, + "step": 3414, + "time_per_iteration": 2.664980173110962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078659, + "balance_loss_mlp": 1.06337631, + "epoch": 0.6569834551750673, + "flos": 655518210048.0, + "grad_norm": 0.06137400431250788, + "language_loss": 0.86412358, + "learning_rate": 0.00027809922440532, + "loss": 0.87491024, + "num_input_tokens_seen": 283573040, + "router_z_loss_mlp": 0.15270996, + "step": 3415, + "time_per_iteration": 2.8617782592773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078566, + "balance_loss_mlp": 1.06294966, + "epoch": 0.6571758368603309, + "flos": 539681107968.0, + "grad_norm": 0.07494876333939839, + "language_loss": 0.80630636, + "learning_rate": 0.00027782008661651406, + "loss": 0.81709206, + "num_input_tokens_seen": 283651696, + "router_z_loss_mlp": 0.15600586, + "step": 3416, + "time_per_iteration": 2.7988359928131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079692, + "balance_loss_mlp": 1.06403971, + "epoch": 0.6573682185455945, + "flos": 497346117120.0, + "grad_norm": 0.0566932008774944, + "language_loss": 0.87253273, + "learning_rate": 0.00027754103508560013, + "loss": 0.88332963, + "num_input_tokens_seen": 283721824, + "router_z_loss_mlp": 0.15637207, + "step": 3417, + "time_per_iteration": 2.63442063331604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083157, + "balance_loss_mlp": 1.06763625, + "epoch": 0.657560600230858, + "flos": 447465295872.0, + "grad_norm": 0.07288802055505218, + "language_loss": 0.82797182, + "learning_rate": 0.0002772620699209163, + "loss": 0.83880341, + "num_input_tokens_seen": 283786960, + "router_z_loss_mlp": 0.1550293, + "step": 3418, + "time_per_iteration": 2.5753917694091797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082218, + "balance_loss_mlp": 1.06645882, + "epoch": 0.6577529819161216, + 
"flos": 481940596224.0, + "grad_norm": 0.08272412201155772, + "language_loss": 0.79880875, + "learning_rate": 0.0002769831912307658, + "loss": 0.80963099, + "num_input_tokens_seen": 283853808, + "router_z_loss_mlp": 0.1574707, + "step": 3419, + "time_per_iteration": 2.57000732421875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087297, + "balance_loss_mlp": 1.07162082, + "epoch": 0.6579453636013851, + "flos": 530843134464.0, + "grad_norm": 0.09925745188023151, + "language_loss": 0.80301243, + "learning_rate": 0.00027670439912341917, + "loss": 0.81388539, + "num_input_tokens_seen": 283920960, + "router_z_loss_mlp": 0.15661621, + "step": 3420, + "time_per_iteration": 2.6356050968170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108308, + "balance_loss_mlp": 1.06716549, + "epoch": 0.6581377452866487, + "flos": 628037743104.0, + "grad_norm": 0.0799291491057382, + "language_loss": 0.8347252, + "learning_rate": 0.0002764256937071129, + "loss": 0.84555596, + "num_input_tokens_seen": 283992416, + "router_z_loss_mlp": 0.15905762, + "step": 3421, + "time_per_iteration": 2.834642171859741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108912, + "balance_loss_mlp": 1.07362247, + "epoch": 0.6583301269719123, + "flos": 548618199552.0, + "grad_norm": 0.07238989087994178, + "language_loss": 0.87290871, + "learning_rate": 0.00027614707509005036, + "loss": 0.88379991, + "num_input_tokens_seen": 284061760, + "router_z_loss_mlp": 0.15478516, + "step": 3422, + "time_per_iteration": 2.725083112716675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094754, + "balance_loss_mlp": 1.0789113, + "epoch": 0.6585225086571759, + "flos": 427493583360.0, + "grad_norm": 0.07227108008700142, + "language_loss": 0.79197395, + "learning_rate": 0.0002758685433804008, + "loss": 0.80292153, + "num_input_tokens_seen": 284124848, + "router_z_loss_mlp": 0.15844727, + "step": 3423, + "time_per_iteration": 2.487705945968628 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088641, + "balance_loss_mlp": 1.07272685, + "epoch": 0.6587148903424394, + "flos": 859620542976.0, + "grad_norm": 0.08320385710428496, + "language_loss": 0.7929312, + "learning_rate": 0.00027559009868630005, + "loss": 0.80381757, + "num_input_tokens_seen": 284206272, + "router_z_loss_mlp": 0.15905762, + "step": 3424, + "time_per_iteration": 3.1158218383789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089311, + "balance_loss_mlp": 1.07396901, + "epoch": 0.6589072720277029, + "flos": 805630551552.0, + "grad_norm": 0.09716201844809616, + "language_loss": 0.79885697, + "learning_rate": 0.0002753117411158491, + "loss": 0.80975008, + "num_input_tokens_seen": 284293696, + "router_z_loss_mlp": 0.15319824, + "step": 3425, + "time_per_iteration": 3.073878049850464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087694, + "balance_loss_mlp": 1.07177925, + "epoch": 0.6590996537129665, + "flos": 548618199552.0, + "grad_norm": 0.07092925892223757, + "language_loss": 0.89722019, + "learning_rate": 0.0002750334707771168, + "loss": 0.90809715, + "num_input_tokens_seen": 284360192, + "router_z_loss_mlp": 0.15905762, + "step": 3426, + "time_per_iteration": 2.656442403793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088978, + "balance_loss_mlp": 1.07301569, + "epoch": 0.6592920353982301, + "flos": 454166092800.0, + "grad_norm": 0.08902896805701885, + "language_loss": 0.81059277, + "learning_rate": 0.0002747552877781369, + "loss": 0.82148254, + "num_input_tokens_seen": 284423680, + "router_z_loss_mlp": 0.1595459, + "step": 3427, + "time_per_iteration": 2.5491814613342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078822, + "balance_loss_mlp": 1.06326556, + "epoch": 0.6594844170834937, + "flos": 567174057984.0, + "grad_norm": 0.07347862596751242, + "language_loss": 0.82162035, + "learning_rate": 0.0002744771922269097, + "loss": 
0.83240855, + "num_input_tokens_seen": 284495712, + "router_z_loss_mlp": 0.15539551, + "step": 3428, + "time_per_iteration": 2.7601091861724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079567, + "balance_loss_mlp": 1.06411695, + "epoch": 0.6596767987687572, + "flos": 1187911194624.0, + "grad_norm": 0.06220461144650757, + "language_loss": 0.81962168, + "learning_rate": 0.0002741991842314015, + "loss": 0.83041739, + "num_input_tokens_seen": 284583440, + "router_z_loss_mlp": 0.15429688, + "step": 3429, + "time_per_iteration": 3.5186891555786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073966, + "balance_loss_mlp": 1.05820632, + "epoch": 0.6598691804540208, + "flos": 503491147776.0, + "grad_norm": 0.08620393230779241, + "language_loss": 0.85952473, + "learning_rate": 0.0002739212638995445, + "loss": 0.87026429, + "num_input_tokens_seen": 284649168, + "router_z_loss_mlp": 0.15759277, + "step": 3430, + "time_per_iteration": 2.5625457763671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076808, + "balance_loss_mlp": 1.0606432, + "epoch": 0.6600615621392844, + "flos": 531337231872.0, + "grad_norm": 0.07532946000641748, + "language_loss": 0.82907259, + "learning_rate": 0.00027364343133923696, + "loss": 0.83984065, + "num_input_tokens_seen": 284723136, + "router_z_loss_mlp": 0.16162109, + "step": 3431, + "time_per_iteration": 2.716170310974121 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082258, + "balance_loss_mlp": 1.06649864, + "epoch": 0.6602539438245479, + "flos": 565446915072.0, + "grad_norm": 0.08436879524299454, + "language_loss": 0.82516879, + "learning_rate": 0.0002733656866583431, + "loss": 0.83599138, + "num_input_tokens_seen": 284792752, + "router_z_loss_mlp": 0.1574707, + "step": 3432, + "time_per_iteration": 2.718897581100464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081528, + "balance_loss_mlp": 1.06595933, + "epoch": 0.6604463255098114, + 
"flos": 857159594496.0, + "grad_norm": 0.09838634377037979, + "language_loss": 0.82723475, + "learning_rate": 0.0002730880299646927, + "loss": 0.83805001, + "num_input_tokens_seen": 284871008, + "router_z_loss_mlp": 0.15551758, + "step": 3433, + "time_per_iteration": 3.057305097579956 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077745, + "balance_loss_mlp": 1.06209278, + "epoch": 0.660638707195075, + "flos": 674462080512.0, + "grad_norm": 0.08984858233417439, + "language_loss": 0.85275245, + "learning_rate": 0.0002728104613660821, + "loss": 0.86352998, + "num_input_tokens_seen": 284945184, + "router_z_loss_mlp": 0.15637207, + "step": 3434, + "time_per_iteration": 2.9105958938598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088498, + "balance_loss_mlp": 1.0729531, + "epoch": 0.6608310888803386, + "flos": 888961402368.0, + "grad_norm": 0.07627448078148022, + "language_loss": 0.83088267, + "learning_rate": 0.0002725329809702729, + "loss": 0.84176767, + "num_input_tokens_seen": 285029296, + "router_z_loss_mlp": 0.15527344, + "step": 3435, + "time_per_iteration": 3.2121472358703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084827, + "balance_loss_mlp": 1.06942487, + "epoch": 0.6610234705656022, + "flos": 1136347646976.0, + "grad_norm": 0.07852463720700995, + "language_loss": 0.761163, + "learning_rate": 0.0002722555888849921, + "loss": 0.77201122, + "num_input_tokens_seen": 285124720, + "router_z_loss_mlp": 0.15380859, + "step": 3436, + "time_per_iteration": 3.4559333324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086334, + "balance_loss_mlp": 1.07087219, + "epoch": 0.6612158522508658, + "flos": 468012598272.0, + "grad_norm": 0.0792973185354779, + "language_loss": 0.80341804, + "learning_rate": 0.00027197828521793334, + "loss": 0.8142814, + "num_input_tokens_seen": 285191360, + "router_z_loss_mlp": 0.15441895, + "step": 3437, + "time_per_iteration": 2.5690367221832275 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083868, + "balance_loss_mlp": 1.06843066, + "epoch": 0.6614082339361292, + "flos": 571653614592.0, + "grad_norm": 0.06733043593984989, + "language_loss": 0.84736151, + "learning_rate": 0.0002717010700767552, + "loss": 0.85820019, + "num_input_tokens_seen": 285262624, + "router_z_loss_mlp": 0.1541748, + "step": 3438, + "time_per_iteration": 2.730611562728882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086291, + "balance_loss_mlp": 1.070472, + "epoch": 0.6616006156213928, + "flos": 498467934720.0, + "grad_norm": 0.08565730680483608, + "language_loss": 0.75949776, + "learning_rate": 0.00027142394356908226, + "loss": 0.77036071, + "num_input_tokens_seen": 285328512, + "router_z_loss_mlp": 0.15808105, + "step": 3439, + "time_per_iteration": 2.591198444366455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085276, + "balance_loss_mlp": 1.06955266, + "epoch": 0.6617929973066564, + "flos": 602420239872.0, + "grad_norm": 0.0872499602859338, + "language_loss": 0.84998727, + "learning_rate": 0.00027114690580250456, + "loss": 0.86083996, + "num_input_tokens_seen": 285406128, + "router_z_loss_mlp": 0.15710449, + "step": 3440, + "time_per_iteration": 2.8140435218811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085777, + "balance_loss_mlp": 1.07024395, + "epoch": 0.66198537899192, + "flos": 522983443968.0, + "grad_norm": 0.08876917190153409, + "language_loss": 0.87157035, + "learning_rate": 0.0002708699568845776, + "loss": 0.88242811, + "num_input_tokens_seen": 285474704, + "router_z_loss_mlp": 0.15515137, + "step": 3441, + "time_per_iteration": 2.6945717334747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009938, + "balance_loss_mlp": 1.0022608, + "epoch": 0.6621777606771835, + "flos": 1566256642560.0, + "grad_norm": 0.011925330418983318, + "language_loss": 0.79287779, + "learning_rate": 0.00027059309692282265, + "loss": 
0.8029772, + "num_input_tokens_seen": 285698704, + "router_z_loss_mlp": 0.07666016, + "step": 3442, + "time_per_iteration": 2.3348398208618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084289, + "balance_loss_mlp": 1.06864882, + "epoch": 0.6623701423624471, + "flos": 526664954880.0, + "grad_norm": 0.08033260761499388, + "language_loss": 0.83087707, + "learning_rate": 0.0002703163260247261, + "loss": 0.84171999, + "num_input_tokens_seen": 285767936, + "router_z_loss_mlp": 0.15625, + "step": 3443, + "time_per_iteration": 2.6553611755371094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088719, + "balance_loss_mlp": 1.07361555, + "epoch": 0.6625625240477107, + "flos": 528179553792.0, + "grad_norm": 0.07663781352911521, + "language_loss": 0.81678534, + "learning_rate": 0.0002700396442977399, + "loss": 0.82767254, + "num_input_tokens_seen": 285839456, + "router_z_loss_mlp": 0.15087891, + "step": 3444, + "time_per_iteration": 2.6430938243865967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085928, + "balance_loss_mlp": 1.07055044, + "epoch": 0.6627549057329742, + "flos": 473122073088.0, + "grad_norm": 0.07089600928708202, + "language_loss": 0.8410843, + "learning_rate": 0.0002697630518492817, + "loss": 0.85194361, + "num_input_tokens_seen": 285905904, + "router_z_loss_mlp": 0.15356445, + "step": 3445, + "time_per_iteration": 2.6689202785491943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083941, + "balance_loss_mlp": 1.06836057, + "epoch": 0.6629472874182378, + "flos": 527996745216.0, + "grad_norm": 0.062094097648188885, + "language_loss": 0.85618681, + "learning_rate": 0.0002694865487867343, + "loss": 0.86702615, + "num_input_tokens_seen": 285975520, + "router_z_loss_mlp": 0.15563965, + "step": 3446, + "time_per_iteration": 2.637334108352661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080989, + "balance_loss_mlp": 1.0658257, + "epoch": 0.6631396691035013, + 
"flos": 613200471552.0, + "grad_norm": 0.05358697815550736, + "language_loss": 0.84430885, + "learning_rate": 0.0002692101352174453, + "loss": 0.85511881, + "num_input_tokens_seen": 286050320, + "router_z_loss_mlp": 0.15148926, + "step": 3447, + "time_per_iteration": 2.8312788009643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088399, + "balance_loss_mlp": 1.0726161, + "epoch": 0.6633320507887649, + "flos": 609318899712.0, + "grad_norm": 0.09194525285750592, + "language_loss": 0.84284896, + "learning_rate": 0.00026893381124872787, + "loss": 0.85373294, + "num_input_tokens_seen": 286120672, + "router_z_loss_mlp": 0.15771484, + "step": 3448, + "time_per_iteration": 2.736720323562622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088634, + "balance_loss_mlp": 1.0732677, + "epoch": 0.6635244324740285, + "flos": 749700873216.0, + "grad_norm": 0.07278095853134793, + "language_loss": 0.80550098, + "learning_rate": 0.00026865757698786097, + "loss": 0.8163873, + "num_input_tokens_seen": 286201152, + "router_z_loss_mlp": 0.15344238, + "step": 3449, + "time_per_iteration": 3.0738682746887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089191, + "balance_loss_mlp": 1.07368147, + "epoch": 0.6637168141592921, + "flos": 664526882304.0, + "grad_norm": 0.07101525688277542, + "language_loss": 0.8160826, + "learning_rate": 0.000268381432542088, + "loss": 0.82697451, + "num_input_tokens_seen": 286274512, + "router_z_loss_mlp": 0.15490723, + "step": 3450, + "time_per_iteration": 2.8799893856048584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079264, + "balance_loss_mlp": 1.06351662, + "epoch": 0.6639091958445555, + "flos": 606783799296.0, + "grad_norm": 0.07040751242592289, + "language_loss": 0.79836357, + "learning_rate": 0.00026810537801861807, + "loss": 0.80915618, + "num_input_tokens_seen": 286349808, + "router_z_loss_mlp": 0.15734863, + "step": 3451, + "time_per_iteration": 2.763744831085205 
+ }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080047, + "balance_loss_mlp": 1.06437111, + "epoch": 0.6641015775298191, + "flos": 476697498624.0, + "grad_norm": 0.06736920749763163, + "language_loss": 0.81280744, + "learning_rate": 0.0002678294135246243, + "loss": 0.82360792, + "num_input_tokens_seen": 286422912, + "router_z_loss_mlp": 0.15661621, + "step": 3452, + "time_per_iteration": 2.7365012168884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077602, + "balance_loss_mlp": 1.06230712, + "epoch": 0.6642939592150827, + "flos": 904115105280.0, + "grad_norm": 0.09242159169716382, + "language_loss": 0.86158502, + "learning_rate": 0.0002675535391672463, + "loss": 0.872361, + "num_input_tokens_seen": 286501072, + "router_z_loss_mlp": 0.15270996, + "step": 3453, + "time_per_iteration": 3.123499870300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074699, + "balance_loss_mlp": 1.05919051, + "epoch": 0.6644863409003463, + "flos": 581808697344.0, + "grad_norm": 0.061204166777709124, + "language_loss": 0.86025405, + "learning_rate": 0.0002672777550535877, + "loss": 0.87100101, + "num_input_tokens_seen": 286580480, + "router_z_loss_mlp": 0.15490723, + "step": 3454, + "time_per_iteration": 2.810504913330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075076, + "balance_loss_mlp": 1.05937576, + "epoch": 0.6646787225856099, + "flos": 479002802688.0, + "grad_norm": 0.06544819992016976, + "language_loss": 0.85221612, + "learning_rate": 0.00026700206129071747, + "loss": 0.8629669, + "num_input_tokens_seen": 286646208, + "router_z_loss_mlp": 0.15686035, + "step": 3455, + "time_per_iteration": 2.55995774269104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077461, + "balance_loss_mlp": 1.0617609, + "epoch": 0.6648711042708734, + "flos": 449906420736.0, + "grad_norm": 0.0675654984198721, + "language_loss": 0.88715339, + "learning_rate": 0.00026672645798566925, + "loss": 
0.897928, + "num_input_tokens_seen": 286710624, + "router_z_loss_mlp": 0.15698242, + "step": 3456, + "time_per_iteration": 2.535385847091675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070254, + "balance_loss_mlp": 1.05481672, + "epoch": 0.665063485956137, + "flos": 858960516096.0, + "grad_norm": 0.09542043769716892, + "language_loss": 0.79432011, + "learning_rate": 0.00026645094524544225, + "loss": 0.80502266, + "num_input_tokens_seen": 286799472, + "router_z_loss_mlp": 0.1541748, + "step": 3457, + "time_per_iteration": 3.309166193008423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076037, + "balance_loss_mlp": 1.05976462, + "epoch": 0.6652558676414005, + "flos": 604312939008.0, + "grad_norm": 0.07016306798114455, + "language_loss": 0.75077713, + "learning_rate": 0.00026617552317699945, + "loss": 0.76153749, + "num_input_tokens_seen": 286874752, + "router_z_loss_mlp": 0.16271973, + "step": 3458, + "time_per_iteration": 2.7976672649383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070804, + "balance_loss_mlp": 1.05473471, + "epoch": 0.6654482493266641, + "flos": 510394576896.0, + "grad_norm": 0.0843552623748082, + "language_loss": 0.87185872, + "learning_rate": 0.0002659001918872693, + "loss": 0.88256675, + "num_input_tokens_seen": 286943312, + "router_z_loss_mlp": 0.16052246, + "step": 3459, + "time_per_iteration": 2.619687080383301 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077284, + "balance_loss_mlp": 1.06169105, + "epoch": 0.6656406310119277, + "flos": 565605130752.0, + "grad_norm": 0.07816001351873277, + "language_loss": 0.81045449, + "learning_rate": 0.0002656249514831449, + "loss": 0.82122731, + "num_input_tokens_seen": 287010000, + "router_z_loss_mlp": 0.15576172, + "step": 3460, + "time_per_iteration": 2.658799409866333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081416, + "balance_loss_mlp": 1.06559706, + "epoch": 0.6658330126971912, + 
"flos": 1024298141184.0, + "grad_norm": 0.06934435326228823, + "language_loss": 0.87209398, + "learning_rate": 0.00026534980207148416, + "loss": 0.88290811, + "num_input_tokens_seen": 287101456, + "router_z_loss_mlp": 0.15808105, + "step": 3461, + "time_per_iteration": 3.4665892124176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080438, + "balance_loss_mlp": 1.06466627, + "epoch": 0.6660253943824548, + "flos": 816823388160.0, + "grad_norm": 0.08829817296611697, + "language_loss": 0.73540372, + "learning_rate": 0.0002650747437591097, + "loss": 0.74620807, + "num_input_tokens_seen": 287182848, + "router_z_loss_mlp": 0.15759277, + "step": 3462, + "time_per_iteration": 3.0266518592834473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011257, + "balance_loss_mlp": 1.00381792, + "epoch": 0.6662177760677184, + "flos": 1496169169920.0, + "grad_norm": 0.013095033581449731, + "language_loss": 0.8187958, + "learning_rate": 0.00026479977665280806, + "loss": 0.82890832, + "num_input_tokens_seen": 287417920, + "router_z_loss_mlp": 0.07421875, + "step": 3463, + "time_per_iteration": 5.052983045578003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079655, + "balance_loss_mlp": 1.06390786, + "epoch": 0.666410157752982, + "flos": 500120925696.0, + "grad_norm": 0.06636905454287767, + "language_loss": 0.8624109, + "learning_rate": 0.00026452490085933155, + "loss": 0.87320745, + "num_input_tokens_seen": 287483776, + "router_z_loss_mlp": 0.15734863, + "step": 3464, + "time_per_iteration": 2.6581099033355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108047, + "balance_loss_mlp": 1.06460285, + "epoch": 0.6666025394382454, + "flos": 481169714688.0, + "grad_norm": 0.08202270474579819, + "language_loss": 0.89890295, + "learning_rate": 0.00026425011648539614, + "loss": 0.90970761, + "num_input_tokens_seen": 287548176, + "router_z_loss_mlp": 0.15856934, + "step": 3465, + "time_per_iteration": 
2.5627684593200684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087333, + "balance_loss_mlp": 1.07175291, + "epoch": 0.666794921123509, + "flos": 546653919744.0, + "grad_norm": 0.06505185619170838, + "language_loss": 0.82792681, + "learning_rate": 0.00026397542363768267, + "loss": 0.83880019, + "num_input_tokens_seen": 287618496, + "router_z_loss_mlp": 0.15563965, + "step": 3466, + "time_per_iteration": 2.6587183475494385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088421, + "balance_loss_mlp": 1.07291174, + "epoch": 0.6669873028087726, + "flos": 471988145664.0, + "grad_norm": 0.07283561194879179, + "language_loss": 0.8210032, + "learning_rate": 0.0002637008224228362, + "loss": 0.83188736, + "num_input_tokens_seen": 287684032, + "router_z_loss_mlp": 0.1550293, + "step": 3467, + "time_per_iteration": 2.5522584915161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104442, + "balance_loss_mlp": 1.08923101, + "epoch": 0.6671796844940362, + "flos": 547395065856.0, + "grad_norm": 0.11764859444366832, + "language_loss": 0.84226644, + "learning_rate": 0.00026342631294746653, + "loss": 0.85331088, + "num_input_tokens_seen": 287757680, + "router_z_loss_mlp": 0.15185547, + "step": 3468, + "time_per_iteration": 2.743476629257202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097687, + "balance_loss_mlp": 1.08171296, + "epoch": 0.6673720661792998, + "flos": 1070317214208.0, + "grad_norm": 0.06605655579182129, + "language_loss": 0.80559593, + "learning_rate": 0.0002631518953181476, + "loss": 0.81657279, + "num_input_tokens_seen": 287848992, + "router_z_loss_mlp": 0.15966797, + "step": 3469, + "time_per_iteration": 3.4870567321777344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020288, + "balance_loss_mlp": 1.01294446, + "epoch": 0.6675644478645633, + "flos": 1523790600192.0, + "grad_norm": 0.018081388353692853, + "language_loss": 0.76325285, + "learning_rate": 
0.000262877569641418, + "loss": 0.77345574, + "num_input_tokens_seen": 288085680, + "router_z_loss_mlp": 0.07324219, + "step": 3470, + "time_per_iteration": 4.947405576705933 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111899, + "balance_loss_mlp": 1.09661639, + "epoch": 0.6677568295498268, + "flos": 579696113664.0, + "grad_norm": 0.07102946995749101, + "language_loss": 0.80474597, + "learning_rate": 0.00026260333602377985, + "loss": 0.81586492, + "num_input_tokens_seen": 288161568, + "router_z_loss_mlp": 0.15258789, + "step": 3471, + "time_per_iteration": 2.766829490661621 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111832, + "balance_loss_mlp": 1.10340738, + "epoch": 0.6679492112350904, + "flos": 383935458816.0, + "grad_norm": 0.07718732975818399, + "language_loss": 0.87198675, + "learning_rate": 0.0002623291945717007, + "loss": 0.88316995, + "num_input_tokens_seen": 288224032, + "router_z_loss_mlp": 0.14892578, + "step": 3472, + "time_per_iteration": 2.442732095718384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119016, + "balance_loss_mlp": 1.10393596, + "epoch": 0.668141592920354, + "flos": 1150759830528.0, + "grad_norm": 0.08543337632227696, + "language_loss": 0.83852649, + "learning_rate": 0.00026205514539161175, + "loss": 0.84971666, + "num_input_tokens_seen": 288312912, + "router_z_loss_mlp": 0.15063477, + "step": 3473, + "time_per_iteration": 3.555755376815796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110415, + "balance_loss_mlp": 1.09519196, + "epoch": 0.6683339746056175, + "flos": 561100608000.0, + "grad_norm": 0.08303995839416844, + "language_loss": 0.84348154, + "learning_rate": 0.00026178118858990773, + "loss": 0.85458577, + "num_input_tokens_seen": 288394224, + "router_z_loss_mlp": 0.15209961, + "step": 3474, + "time_per_iteration": 2.871601104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101729, + "balance_loss_mlp": 1.0864346, + 
"epoch": 0.6685263562908811, + "flos": 514305884160.0, + "grad_norm": 0.07741994536037751, + "language_loss": 0.84034884, + "learning_rate": 0.0002615073242729483, + "loss": 0.8513661, + "num_input_tokens_seen": 288462976, + "router_z_loss_mlp": 0.15283203, + "step": 3475, + "time_per_iteration": 2.643118381500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105115, + "balance_loss_mlp": 1.08948672, + "epoch": 0.6687187379761447, + "flos": 629772226560.0, + "grad_norm": 0.06829257920767222, + "language_loss": 0.84361756, + "learning_rate": 0.0002612335525470573, + "loss": 0.85466868, + "num_input_tokens_seen": 288542032, + "router_z_loss_mlp": 0.15612793, + "step": 3476, + "time_per_iteration": 2.819558620452881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095918, + "balance_loss_mlp": 1.0802896, + "epoch": 0.6689111196614083, + "flos": 535586992128.0, + "grad_norm": 0.0704314528476978, + "language_loss": 0.77963984, + "learning_rate": 0.0002609598735185221, + "loss": 0.79059905, + "num_input_tokens_seen": 288610704, + "router_z_loss_mlp": 0.15612793, + "step": 3477, + "time_per_iteration": 2.690528631210327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087074, + "balance_loss_mlp": 1.07150483, + "epoch": 0.6691035013466718, + "flos": 603038048256.0, + "grad_norm": 0.061090592598961185, + "language_loss": 0.83056384, + "learning_rate": 0.00026068628729359445, + "loss": 0.8414346, + "num_input_tokens_seen": 288686080, + "router_z_loss_mlp": 0.15563965, + "step": 3478, + "time_per_iteration": 2.78446364402771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085998, + "balance_loss_mlp": 1.07035732, + "epoch": 0.6692958830319353, + "flos": 632855752704.0, + "grad_norm": 0.06337221055834462, + "language_loss": 0.76063514, + "learning_rate": 0.00026041279397848996, + "loss": 0.7714951, + "num_input_tokens_seen": 288764944, + "router_z_loss_mlp": 0.15625, + "step": 3479, + 
"time_per_iteration": 2.868635654449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081234, + "balance_loss_mlp": 1.06561804, + "epoch": 0.6694882647171989, + "flos": 645471783936.0, + "grad_norm": 0.06666053419548093, + "language_loss": 0.82793164, + "learning_rate": 0.00026013939367938797, + "loss": 0.83874393, + "num_input_tokens_seen": 288847856, + "router_z_loss_mlp": 0.15600586, + "step": 3480, + "time_per_iteration": 2.908998489379883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077089, + "balance_loss_mlp": 1.0618062, + "epoch": 0.6696806464024625, + "flos": 569585447424.0, + "grad_norm": 0.068968619676177, + "language_loss": 0.80981463, + "learning_rate": 0.00025986608650243204, + "loss": 0.82058555, + "num_input_tokens_seen": 288929360, + "router_z_loss_mlp": 0.15258789, + "step": 3481, + "time_per_iteration": 2.834899663925171 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073739, + "balance_loss_mlp": 1.05738401, + "epoch": 0.6698730280877261, + "flos": 622700669952.0, + "grad_norm": 0.06754459840713438, + "language_loss": 0.79205263, + "learning_rate": 0.0002595928725537293, + "loss": 0.80279005, + "num_input_tokens_seen": 289010160, + "router_z_loss_mlp": 0.16357422, + "step": 3482, + "time_per_iteration": 2.8607449531555176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075932, + "balance_loss_mlp": 1.0601126, + "epoch": 0.6700654097729896, + "flos": 502507722240.0, + "grad_norm": 0.07337554389160682, + "language_loss": 0.87835222, + "learning_rate": 0.0002593197519393509, + "loss": 0.88911158, + "num_input_tokens_seen": 289077392, + "router_z_loss_mlp": 0.15808105, + "step": 3483, + "time_per_iteration": 2.600332021713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074206, + "balance_loss_mlp": 1.05839944, + "epoch": 0.6702577914582531, + "flos": 623876815872.0, + "grad_norm": 0.05836331293151798, + "language_loss": 0.79302973, + 
"learning_rate": 0.00025904672476533165, + "loss": 0.80377179, + "num_input_tokens_seen": 289157248, + "router_z_loss_mlp": 0.15795898, + "step": 3484, + "time_per_iteration": 2.883197546005249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073722, + "balance_loss_mlp": 1.05790257, + "epoch": 0.6704501731435167, + "flos": 456268764672.0, + "grad_norm": 0.07153016836752667, + "language_loss": 0.82456666, + "learning_rate": 0.0002587737911376704, + "loss": 0.8353039, + "num_input_tokens_seen": 289224864, + "router_z_loss_mlp": 0.15808105, + "step": 3485, + "time_per_iteration": 2.6315789222717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066049, + "balance_loss_mlp": 1.04982471, + "epoch": 0.6706425548287803, + "flos": 543229369344.0, + "grad_norm": 0.07097685042918324, + "language_loss": 0.84046322, + "learning_rate": 0.00025850095116232885, + "loss": 0.85112369, + "num_input_tokens_seen": 289293488, + "router_z_loss_mlp": 0.16223145, + "step": 3486, + "time_per_iteration": 2.717060089111328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067626, + "balance_loss_mlp": 1.05155683, + "epoch": 0.6708349365140439, + "flos": 633940494336.0, + "grad_norm": 0.07403350105376266, + "language_loss": 0.77802885, + "learning_rate": 0.000258228204945233, + "loss": 0.78870511, + "num_input_tokens_seen": 289370560, + "router_z_loss_mlp": 0.16052246, + "step": 3487, + "time_per_iteration": 2.933227777481079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071197, + "balance_loss_mlp": 1.05525851, + "epoch": 0.6710273181993074, + "flos": 640747749888.0, + "grad_norm": 0.07338274313839728, + "language_loss": 0.8460936, + "learning_rate": 0.00025795555259227254, + "loss": 0.85680556, + "num_input_tokens_seen": 289440096, + "router_z_loss_mlp": 0.15930176, + "step": 3488, + "time_per_iteration": 2.817223072052002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067525, + 
"balance_loss_mlp": 1.05141997, + "epoch": 0.671219699884571, + "flos": 553942789632.0, + "grad_norm": 0.07175152498694279, + "language_loss": 0.83673614, + "learning_rate": 0.00025768299420930046, + "loss": 0.84741139, + "num_input_tokens_seen": 289515808, + "router_z_loss_mlp": 0.16101074, + "step": 3489, + "time_per_iteration": 2.7990496158599854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070538, + "balance_loss_mlp": 1.05465984, + "epoch": 0.6714120815698346, + "flos": 731508433920.0, + "grad_norm": 0.1191691604479504, + "language_loss": 0.83219582, + "learning_rate": 0.0002574105299021332, + "loss": 0.84290123, + "num_input_tokens_seen": 289591344, + "router_z_loss_mlp": 0.15869141, + "step": 3490, + "time_per_iteration": 2.943882703781128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070424, + "balance_loss_mlp": 1.05427098, + "epoch": 0.6716044632550981, + "flos": 688664291328.0, + "grad_norm": 0.07146897272940887, + "language_loss": 0.84251606, + "learning_rate": 0.00025713815977655084, + "loss": 0.85322034, + "num_input_tokens_seen": 289672032, + "router_z_loss_mlp": 0.16149902, + "step": 3491, + "time_per_iteration": 2.896653890609741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072686, + "balance_loss_mlp": 1.05612803, + "epoch": 0.6717968449403616, + "flos": 460629752832.0, + "grad_norm": 0.08069380476860724, + "language_loss": 0.84602511, + "learning_rate": 0.0002568658839382969, + "loss": 0.85675204, + "num_input_tokens_seen": 289738304, + "router_z_loss_mlp": 0.16552734, + "step": 3492, + "time_per_iteration": 2.576414108276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071874, + "balance_loss_mlp": 1.05595946, + "epoch": 0.6719892266256252, + "flos": 501608360448.0, + "grad_norm": 0.08814414328325225, + "language_loss": 0.84382427, + "learning_rate": 0.00025659370249307814, + "loss": 0.85454303, + "num_input_tokens_seen": 289804304, + "router_z_loss_mlp": 
0.15905762, + "step": 3493, + "time_per_iteration": 2.6682934761047363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107054, + "balance_loss_mlp": 1.05422044, + "epoch": 0.6721816083108888, + "flos": 683525081088.0, + "grad_norm": 0.0794608767189563, + "language_loss": 0.852211, + "learning_rate": 0.00025632161554656473, + "loss": 0.86291635, + "num_input_tokens_seen": 289877696, + "router_z_loss_mlp": 0.16320801, + "step": 3494, + "time_per_iteration": 2.8697800636291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074018, + "balance_loss_mlp": 1.05849671, + "epoch": 0.6723739899961524, + "flos": 585813980160.0, + "grad_norm": 0.07486756079223739, + "language_loss": 0.82214803, + "learning_rate": 0.00025604962320439017, + "loss": 0.83288819, + "num_input_tokens_seen": 289947296, + "router_z_loss_mlp": 0.1550293, + "step": 3495, + "time_per_iteration": 2.6910951137542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071203, + "balance_loss_mlp": 1.0550859, + "epoch": 0.672566371681416, + "flos": 506616519168.0, + "grad_norm": 0.07275570378871335, + "language_loss": 0.82281178, + "learning_rate": 0.0002557777255721516, + "loss": 0.83352381, + "num_input_tokens_seen": 290020080, + "router_z_loss_mlp": 0.16113281, + "step": 3496, + "time_per_iteration": 2.7872824668884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106895, + "balance_loss_mlp": 1.05311894, + "epoch": 0.6727587533666795, + "flos": 535671055872.0, + "grad_norm": 0.08498246937997968, + "language_loss": 0.80356646, + "learning_rate": 0.0002555059227554087, + "loss": 0.81425595, + "num_input_tokens_seen": 290094544, + "router_z_loss_mlp": 0.15820312, + "step": 3497, + "time_per_iteration": 2.740039110183716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074344, + "balance_loss_mlp": 1.05823898, + "epoch": 0.672951135051943, + "flos": 602832844800.0, + "grad_norm": 0.07033333271672201, + "language_loss": 
0.78129387, + "learning_rate": 0.00025523421485968453, + "loss": 0.79203725, + "num_input_tokens_seen": 290173520, + "router_z_loss_mlp": 0.16088867, + "step": 3498, + "time_per_iteration": 2.835993528366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010723, + "balance_loss_mlp": 1.05608737, + "epoch": 0.6731435167372066, + "flos": 811315989504.0, + "grad_norm": 0.0769826573624616, + "language_loss": 0.85233575, + "learning_rate": 0.00025496260199046585, + "loss": 0.86305881, + "num_input_tokens_seen": 290248240, + "router_z_loss_mlp": 0.16210938, + "step": 3499, + "time_per_iteration": 2.9663631916046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072722, + "balance_loss_mlp": 1.05689144, + "epoch": 0.6733358984224702, + "flos": 611594468352.0, + "grad_norm": 0.06665243506821861, + "language_loss": 0.84537292, + "learning_rate": 0.000254691084253202, + "loss": 0.85610014, + "num_input_tokens_seen": 290326288, + "router_z_loss_mlp": 0.15820312, + "step": 3500, + "time_per_iteration": 2.8711283206939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069122, + "balance_loss_mlp": 1.05341005, + "epoch": 0.6735282801077337, + "flos": 558901762560.0, + "grad_norm": 0.07241548369558537, + "language_loss": 0.7720896, + "learning_rate": 0.00025441966175330567, + "loss": 0.78278077, + "num_input_tokens_seen": 290395984, + "router_z_loss_mlp": 0.15698242, + "step": 3501, + "time_per_iteration": 2.6806578636169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075273, + "balance_loss_mlp": 1.0595727, + "epoch": 0.6737206617929973, + "flos": 672433560576.0, + "grad_norm": 0.06443940895189408, + "language_loss": 0.79419506, + "learning_rate": 0.00025414833459615183, + "loss": 0.80494785, + "num_input_tokens_seen": 290470224, + "router_z_loss_mlp": 0.15686035, + "step": 3502, + "time_per_iteration": 2.8561007976531982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073331, + 
"balance_loss_mlp": 1.05758393, + "epoch": 0.6739130434782609, + "flos": 633446396928.0, + "grad_norm": 0.08973671276033753, + "language_loss": 0.80348676, + "learning_rate": 0.0002538771028870796, + "loss": 0.81422007, + "num_input_tokens_seen": 290542864, + "router_z_loss_mlp": 0.15734863, + "step": 3503, + "time_per_iteration": 2.8288323879241943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070997, + "balance_loss_mlp": 1.05505931, + "epoch": 0.6741054251635245, + "flos": 531445888512.0, + "grad_norm": 0.0592562662752063, + "language_loss": 0.81655902, + "learning_rate": 0.0002536059667313903, + "loss": 0.82726896, + "num_input_tokens_seen": 290617248, + "router_z_loss_mlp": 0.15930176, + "step": 3504, + "time_per_iteration": 2.7216036319732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065153, + "balance_loss_mlp": 1.04882145, + "epoch": 0.674297806848788, + "flos": 542604220416.0, + "grad_norm": 0.06237409857608865, + "language_loss": 0.89354527, + "learning_rate": 0.0002533349262343483, + "loss": 0.90419674, + "num_input_tokens_seen": 290690112, + "router_z_loss_mlp": 0.16333008, + "step": 3505, + "time_per_iteration": 2.6963226795196533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068691, + "balance_loss_mlp": 1.05269337, + "epoch": 0.6744901885340515, + "flos": 463523129856.0, + "grad_norm": 0.07295709640267785, + "language_loss": 0.82094926, + "learning_rate": 0.0002530639815011807, + "loss": 0.83163619, + "num_input_tokens_seen": 290756352, + "router_z_loss_mlp": 0.15991211, + "step": 3506, + "time_per_iteration": 2.5332834720611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064091, + "balance_loss_mlp": 1.04796219, + "epoch": 0.6746825702193151, + "flos": 631830481920.0, + "grad_norm": 0.07658306531614024, + "language_loss": 0.8492251, + "learning_rate": 0.0002527931326370781, + "loss": 0.85986602, + "num_input_tokens_seen": 290829776, + "router_z_loss_mlp": 
0.16125488, + "step": 3507, + "time_per_iteration": 2.803288459777832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106573, + "balance_loss_mlp": 1.04962516, + "epoch": 0.6748749519045787, + "flos": 671146186752.0, + "grad_norm": 0.1018008766550388, + "language_loss": 0.83113879, + "learning_rate": 0.00025252237974719276, + "loss": 0.84179616, + "num_input_tokens_seen": 290900736, + "router_z_loss_mlp": 0.16101074, + "step": 3508, + "time_per_iteration": 2.8635196685791016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065757, + "balance_loss_mlp": 1.04993796, + "epoch": 0.6750673335898423, + "flos": 767102980608.0, + "grad_norm": 0.07402200210263096, + "language_loss": 0.80262941, + "learning_rate": 0.00025225172293664056, + "loss": 0.81328702, + "num_input_tokens_seen": 290981696, + "router_z_loss_mlp": 0.15808105, + "step": 3509, + "time_per_iteration": 3.0501134395599365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021392, + "balance_loss_mlp": 1.01371539, + "epoch": 0.6752597152751059, + "flos": 1512607675392.0, + "grad_norm": 0.015224351470046544, + "language_loss": 0.76933134, + "learning_rate": 0.00025198116231049954, + "loss": 0.77954531, + "num_input_tokens_seen": 291217888, + "router_z_loss_mlp": 0.07666016, + "step": 3510, + "time_per_iteration": 4.970271825790405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067885, + "balance_loss_mlp": 1.05207801, + "epoch": 0.6754520969603693, + "flos": 687297996288.0, + "grad_norm": 0.07849919507555615, + "language_loss": 0.84722501, + "learning_rate": 0.00025171069797381106, + "loss": 0.8579039, + "num_input_tokens_seen": 291287856, + "router_z_loss_mlp": 0.15795898, + "step": 3511, + "time_per_iteration": 2.851109027862549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061123, + "balance_loss_mlp": 1.04545879, + "epoch": 0.6756444786456329, + "flos": 500577947136.0, + "grad_norm": 0.07844004767829481, + 
"language_loss": 0.81844723, + "learning_rate": 0.00025144033003157864, + "loss": 0.82905853, + "num_input_tokens_seen": 291354912, + "router_z_loss_mlp": 0.15649414, + "step": 3512, + "time_per_iteration": 2.674426794052124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065093, + "balance_loss_mlp": 1.04981041, + "epoch": 0.6758368603308965, + "flos": 492616940544.0, + "grad_norm": 0.07444066278959373, + "language_loss": 0.78994167, + "learning_rate": 0.00025117005858876806, + "loss": 0.80059266, + "num_input_tokens_seen": 291426816, + "router_z_loss_mlp": 0.15270996, + "step": 3513, + "time_per_iteration": 2.7095823287963867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065974, + "balance_loss_mlp": 1.05047631, + "epoch": 0.6760292420161601, + "flos": 555934233600.0, + "grad_norm": 0.07261266754873474, + "language_loss": 0.85087454, + "learning_rate": 0.000250899883750308, + "loss": 0.86153424, + "num_input_tokens_seen": 291497648, + "router_z_loss_mlp": 0.15490723, + "step": 3514, + "time_per_iteration": 2.7069034576416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069973, + "balance_loss_mlp": 1.05441582, + "epoch": 0.6762216237014236, + "flos": 607601668608.0, + "grad_norm": 0.07481063892368622, + "language_loss": 0.81707394, + "learning_rate": 0.00025062980562109006, + "loss": 0.82777369, + "num_input_tokens_seen": 291568080, + "router_z_loss_mlp": 0.15539551, + "step": 3515, + "time_per_iteration": 2.7273197174072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069966, + "balance_loss_mlp": 1.05483794, + "epoch": 0.6764140053866872, + "flos": 533785697280.0, + "grad_norm": 0.08230462896516925, + "language_loss": 0.82973194, + "learning_rate": 0.0002503598243059677, + "loss": 0.84043157, + "num_input_tokens_seen": 291644896, + "router_z_loss_mlp": 0.15100098, + "step": 3516, + "time_per_iteration": 2.7930819988250732 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01069319, + "balance_loss_mlp": 1.05384541, + "epoch": 0.6766063870719508, + "flos": 504810455040.0, + "grad_norm": 0.07282849285049217, + "language_loss": 0.79984844, + "learning_rate": 0.0002500899399097568, + "loss": 0.81054163, + "num_input_tokens_seen": 291716864, + "router_z_loss_mlp": 0.15466309, + "step": 3517, + "time_per_iteration": 2.685296058654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072996, + "balance_loss_mlp": 1.05737984, + "epoch": 0.6767987687572143, + "flos": 513176726016.0, + "grad_norm": 0.08174636424990783, + "language_loss": 0.85385978, + "learning_rate": 0.0002498201525372359, + "loss": 0.86458969, + "num_input_tokens_seen": 291786000, + "router_z_loss_mlp": 0.15600586, + "step": 3518, + "time_per_iteration": 2.606057643890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076186, + "balance_loss_mlp": 1.06118929, + "epoch": 0.6769911504424779, + "flos": 525039128064.0, + "grad_norm": 0.0782780776412412, + "language_loss": 0.83019435, + "learning_rate": 0.00024955046229314584, + "loss": 0.84095621, + "num_input_tokens_seen": 291854768, + "router_z_loss_mlp": 0.14978027, + "step": 3519, + "time_per_iteration": 2.6214327812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074949, + "balance_loss_mlp": 1.05932105, + "epoch": 0.6771835321277414, + "flos": 449896508928.0, + "grad_norm": 0.06363729030714552, + "language_loss": 0.87326723, + "learning_rate": 0.00024928086928218947, + "loss": 0.88401669, + "num_input_tokens_seen": 291918096, + "router_z_loss_mlp": 0.15612793, + "step": 3520, + "time_per_iteration": 2.5505292415618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082672, + "balance_loss_mlp": 1.06697249, + "epoch": 0.677375913813005, + "flos": 709349985792.0, + "grad_norm": 0.081240608795973, + "language_loss": 0.75931144, + "learning_rate": 0.00024901137360903216, + "loss": 0.77013814, + "num_input_tokens_seen": 
291998752, + "router_z_loss_mlp": 0.15686035, + "step": 3521, + "time_per_iteration": 2.950173854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108816, + "balance_loss_mlp": 1.07259083, + "epoch": 0.6775682954982686, + "flos": 428420109312.0, + "grad_norm": 0.0718633892564106, + "language_loss": 0.80979002, + "learning_rate": 0.00024874197537830115, + "loss": 0.82067156, + "num_input_tokens_seen": 292065056, + "router_z_loss_mlp": 0.15551758, + "step": 3522, + "time_per_iteration": 2.5892205238342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090688, + "balance_loss_mlp": 1.07558465, + "epoch": 0.6777606771835322, + "flos": 437905626624.0, + "grad_norm": 0.07642815095579086, + "language_loss": 0.83230734, + "learning_rate": 0.00024847267469458684, + "loss": 0.84321427, + "num_input_tokens_seen": 292129248, + "router_z_loss_mlp": 0.15087891, + "step": 3523, + "time_per_iteration": 2.5798654556274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092549, + "balance_loss_mlp": 1.07682538, + "epoch": 0.6779530588687956, + "flos": 775442087424.0, + "grad_norm": 0.08524244815477096, + "language_loss": 0.77646792, + "learning_rate": 0.00024820347166244034, + "loss": 0.78739345, + "num_input_tokens_seen": 292206080, + "router_z_loss_mlp": 0.15710449, + "step": 3524, + "time_per_iteration": 3.0167675018310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096891, + "balance_loss_mlp": 1.0811317, + "epoch": 0.6781454405540592, + "flos": 571782094848.0, + "grad_norm": 0.06181044738082458, + "language_loss": 0.84706652, + "learning_rate": 0.0002479343663863755, + "loss": 0.85803545, + "num_input_tokens_seen": 292280192, + "router_z_loss_mlp": 0.1574707, + "step": 3525, + "time_per_iteration": 2.783334255218506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093963, + "balance_loss_mlp": 1.07813191, + "epoch": 0.6783378222393228, + "flos": 485026693632.0, + "grad_norm": 
0.07100418030431462, + "language_loss": 0.76605028, + "learning_rate": 0.00024766535897086876, + "loss": 0.77698994, + "num_input_tokens_seen": 292347792, + "router_z_loss_mlp": 0.15820312, + "step": 3526, + "time_per_iteration": 2.5780889987945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090687, + "balance_loss_mlp": 1.07472503, + "epoch": 0.6785302039245864, + "flos": 482839958016.0, + "grad_norm": 0.07633985319518004, + "language_loss": 0.79213607, + "learning_rate": 0.0002473964495203578, + "loss": 0.80304295, + "num_input_tokens_seen": 292420032, + "router_z_loss_mlp": 0.15966797, + "step": 3527, + "time_per_iteration": 2.6945083141326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094522, + "balance_loss_mlp": 1.07897758, + "epoch": 0.67872258560985, + "flos": 524732608512.0, + "grad_norm": 0.0748712356502137, + "language_loss": 0.85111511, + "learning_rate": 0.0002471276381392425, + "loss": 0.86206043, + "num_input_tokens_seen": 292497792, + "router_z_loss_mlp": 0.15527344, + "step": 3528, + "time_per_iteration": 2.815568208694458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021348, + "balance_loss_mlp": 1.01443386, + "epoch": 0.6789149672951135, + "flos": 1552605428736.0, + "grad_norm": 0.02081860752447363, + "language_loss": 0.78188634, + "learning_rate": 0.0002468589249318848, + "loss": 0.79209983, + "num_input_tokens_seen": 292726704, + "router_z_loss_mlp": 0.06933594, + "step": 3529, + "time_per_iteration": 5.020832061767578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090569, + "balance_loss_mlp": 1.07503641, + "epoch": 0.6791073489803771, + "flos": 741406556160.0, + "grad_norm": 0.06927800077395886, + "language_loss": 0.83888638, + "learning_rate": 0.00024659031000260826, + "loss": 0.84979212, + "num_input_tokens_seen": 292802320, + "router_z_loss_mlp": 0.15515137, + "step": 3530, + "time_per_iteration": 2.9285757541656494 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01085207, + "balance_loss_mlp": 1.06901824, + "epoch": 0.6792997306656406, + "flos": 576365538816.0, + "grad_norm": 0.07665905507677669, + "language_loss": 0.80867362, + "learning_rate": 0.0002463217934556985, + "loss": 0.81952572, + "num_input_tokens_seen": 292870480, + "router_z_loss_mlp": 0.16186523, + "step": 3531, + "time_per_iteration": 2.6685454845428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013718, + "balance_loss_mlp": 1.00685167, + "epoch": 0.6794921123509042, + "flos": 1503337273344.0, + "grad_norm": 0.012757439752192143, + "language_loss": 0.7653209, + "learning_rate": 0.000246053375395403, + "loss": 0.7754581, + "num_input_tokens_seen": 293100752, + "router_z_loss_mlp": 0.06884766, + "step": 3532, + "time_per_iteration": 4.770366191864014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108006, + "balance_loss_mlp": 1.06464624, + "epoch": 0.6796844940361677, + "flos": 698923261440.0, + "grad_norm": 0.08290950297443149, + "language_loss": 0.83307445, + "learning_rate": 0.0002457850559259306, + "loss": 0.84387505, + "num_input_tokens_seen": 293178192, + "router_z_loss_mlp": 0.15393066, + "step": 3533, + "time_per_iteration": 2.927530527114868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084943, + "balance_loss_mlp": 1.06996989, + "epoch": 0.6798768757214313, + "flos": 552759303168.0, + "grad_norm": 0.07118556269002856, + "language_loss": 0.81413895, + "learning_rate": 0.00024551683515145275, + "loss": 0.82498837, + "num_input_tokens_seen": 293246368, + "router_z_loss_mlp": 0.1496582, + "step": 3534, + "time_per_iteration": 2.7142248153686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084982, + "balance_loss_mlp": 1.06973481, + "epoch": 0.6800692574066949, + "flos": 522936456192.0, + "grad_norm": 0.08177709716799147, + "language_loss": 0.86470234, + "learning_rate": 0.0002452487131761014, + "loss": 0.87555218, + 
"num_input_tokens_seen": 293320656, + "router_z_loss_mlp": 0.15222168, + "step": 3535, + "time_per_iteration": 2.7160773277282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082392, + "balance_loss_mlp": 1.06722808, + "epoch": 0.6802616390919585, + "flos": 574023158784.0, + "grad_norm": 0.08194963503580213, + "language_loss": 0.79881543, + "learning_rate": 0.00024498069010397093, + "loss": 0.80963933, + "num_input_tokens_seen": 293388592, + "router_z_loss_mlp": 0.15136719, + "step": 3536, + "time_per_iteration": 2.7041375637054443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089238, + "balance_loss_mlp": 1.07374132, + "epoch": 0.6804540207772221, + "flos": 488157207552.0, + "grad_norm": 0.06629203560716768, + "language_loss": 0.85175467, + "learning_rate": 0.00024471276603911697, + "loss": 0.862647, + "num_input_tokens_seen": 293453936, + "router_z_loss_mlp": 0.15478516, + "step": 3537, + "time_per_iteration": 2.6187474727630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087967, + "balance_loss_mlp": 1.07295895, + "epoch": 0.6806464024624855, + "flos": 578594119680.0, + "grad_norm": 0.06748814664215633, + "language_loss": 0.79046237, + "learning_rate": 0.0002444449410855572, + "loss": 0.80134201, + "num_input_tokens_seen": 293527664, + "router_z_loss_mlp": 0.14990234, + "step": 3538, + "time_per_iteration": 2.7858455181121826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086378, + "balance_loss_mlp": 1.07165527, + "epoch": 0.6808387841477491, + "flos": 553722905088.0, + "grad_norm": 0.061176482277740064, + "language_loss": 0.83929294, + "learning_rate": 0.00024417721534727033, + "loss": 0.85015678, + "num_input_tokens_seen": 293599344, + "router_z_loss_mlp": 0.14697266, + "step": 3539, + "time_per_iteration": 2.651829957962036 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084901, + "balance_loss_mlp": 1.07020283, + "epoch": 0.6810311658330127, + "flos": 
426841270272.0, + "grad_norm": 0.11547680699328156, + "language_loss": 0.83058399, + "learning_rate": 0.00024390958892819687, + "loss": 0.84143305, + "num_input_tokens_seen": 293663088, + "router_z_loss_mlp": 0.14685059, + "step": 3540, + "time_per_iteration": 2.4877548217773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095795, + "balance_loss_mlp": 1.08085859, + "epoch": 0.6812235475182763, + "flos": 572256368640.0, + "grad_norm": 0.08391920158567873, + "language_loss": 0.80917513, + "learning_rate": 0.0002436420619322381, + "loss": 0.82013303, + "num_input_tokens_seen": 293741296, + "router_z_loss_mlp": 0.14904785, + "step": 3541, + "time_per_iteration": 2.803321361541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098037, + "balance_loss_mlp": 1.08268261, + "epoch": 0.6814159292035398, + "flos": 501917078016.0, + "grad_norm": 0.06869492380097451, + "language_loss": 0.82632923, + "learning_rate": 0.0002433746344632577, + "loss": 0.8373096, + "num_input_tokens_seen": 293815840, + "router_z_loss_mlp": 0.15332031, + "step": 3542, + "time_per_iteration": 2.6754159927368164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084182, + "balance_loss_mlp": 1.06911397, + "epoch": 0.6816083108888034, + "flos": 765531482112.0, + "grad_norm": 0.08922894517327895, + "language_loss": 0.79983473, + "learning_rate": 0.00024310730662508006, + "loss": 0.81067657, + "num_input_tokens_seen": 293896368, + "router_z_loss_mlp": 0.15039062, + "step": 3543, + "time_per_iteration": 3.0928573608398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088886, + "balance_loss_mlp": 1.07410395, + "epoch": 0.681800692574067, + "flos": 479459824128.0, + "grad_norm": 0.06644129249236999, + "language_loss": 0.87379378, + "learning_rate": 0.0002428400785214911, + "loss": 0.88468266, + "num_input_tokens_seen": 293963344, + "router_z_loss_mlp": 0.14758301, + "step": 3544, + "time_per_iteration": 2.604871988296509 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085069, + "balance_loss_mlp": 1.07040668, + "epoch": 0.6819930742593305, + "flos": 691604656128.0, + "grad_norm": 0.06899177555305853, + "language_loss": 0.8269285, + "learning_rate": 0.00024257295025623794, + "loss": 0.83777916, + "num_input_tokens_seen": 294035440, + "router_z_loss_mlp": 0.14648438, + "step": 3545, + "time_per_iteration": 2.871487617492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087393, + "balance_loss_mlp": 1.07256377, + "epoch": 0.6821854559445941, + "flos": 678096603648.0, + "grad_norm": 0.06856452651542601, + "language_loss": 0.80378067, + "learning_rate": 0.00024230592193302892, + "loss": 0.81465465, + "num_input_tokens_seen": 294116944, + "router_z_loss_mlp": 0.14807129, + "step": 3546, + "time_per_iteration": 2.9170467853546143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082599, + "balance_loss_mlp": 1.06824601, + "epoch": 0.6823778376298576, + "flos": 462191339520.0, + "grad_norm": 0.06947354810539072, + "language_loss": 0.84322613, + "learning_rate": 0.00024203899365553372, + "loss": 0.85405213, + "num_input_tokens_seen": 294178976, + "router_z_loss_mlp": 0.14343262, + "step": 3547, + "time_per_iteration": 2.5574960708618164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018037, + "balance_loss_mlp": 1.01088417, + "epoch": 0.6825702193151212, + "flos": 1475298842112.0, + "grad_norm": 0.009467929777895517, + "language_loss": 0.76734358, + "learning_rate": 0.00024177216552738302, + "loss": 0.77752393, + "num_input_tokens_seen": 294384960, + "router_z_loss_mlp": 0.07128906, + "step": 3548, + "time_per_iteration": 4.566468954086304 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081922, + "balance_loss_mlp": 1.06721163, + "epoch": 0.6827626010003848, + "flos": 723114998784.0, + "grad_norm": 0.07355519456276065, + "language_loss": 0.83346403, + "learning_rate": 0.00024150543765216848, + "loss": 
0.84428328, + "num_input_tokens_seen": 294461408, + "router_z_loss_mlp": 0.14697266, + "step": 3549, + "time_per_iteration": 2.9388909339904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086551, + "balance_loss_mlp": 1.07185233, + "epoch": 0.6829549826856484, + "flos": 558864686592.0, + "grad_norm": 0.27537387522176376, + "language_loss": 0.83331466, + "learning_rate": 0.00024123881013344352, + "loss": 0.84418023, + "num_input_tokens_seen": 294530624, + "router_z_loss_mlp": 0.14685059, + "step": 3550, + "time_per_iteration": 2.683187484741211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090347, + "balance_loss_mlp": 1.07533836, + "epoch": 0.6831473643709118, + "flos": 624934393344.0, + "grad_norm": 0.060306055735835584, + "language_loss": 0.79340541, + "learning_rate": 0.00024097228307472202, + "loss": 0.80430889, + "num_input_tokens_seen": 294606784, + "router_z_loss_mlp": 0.14990234, + "step": 3551, + "time_per_iteration": 2.844684362411499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092152, + "balance_loss_mlp": 1.07727528, + "epoch": 0.6833397460561754, + "flos": 713861849088.0, + "grad_norm": 0.10551739620621807, + "language_loss": 0.82146621, + "learning_rate": 0.00024070585657947846, + "loss": 0.83238769, + "num_input_tokens_seen": 294686960, + "router_z_loss_mlp": 0.14855957, + "step": 3552, + "time_per_iteration": 2.8860158920288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094236, + "balance_loss_mlp": 1.07928681, + "epoch": 0.683532127741439, + "flos": 464704045056.0, + "grad_norm": 0.060639169561421215, + "language_loss": 0.85149372, + "learning_rate": 0.00024043953075114934, + "loss": 0.86243612, + "num_input_tokens_seen": 294759712, + "router_z_loss_mlp": 0.14941406, + "step": 3553, + "time_per_iteration": 2.677131175994873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092464, + "balance_loss_mlp": 1.07733643, + "epoch": 
0.6837245094267026, + "flos": 582251037696.0, + "grad_norm": 0.09003750211416942, + "language_loss": 0.88845998, + "learning_rate": 0.00024017330569313128, + "loss": 0.89938462, + "num_input_tokens_seen": 294830592, + "router_z_loss_mlp": 0.15100098, + "step": 3554, + "time_per_iteration": 2.7099804878234863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089308, + "balance_loss_mlp": 1.07441902, + "epoch": 0.6839168911119662, + "flos": 794173413888.0, + "grad_norm": 0.06693310878195398, + "language_loss": 0.74663389, + "learning_rate": 0.0002399071815087821, + "loss": 0.75752699, + "num_input_tokens_seen": 294907504, + "router_z_loss_mlp": 0.14855957, + "step": 3555, + "time_per_iteration": 3.0262036323547363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093717, + "balance_loss_mlp": 1.07882786, + "epoch": 0.6841092727972297, + "flos": 580009973760.0, + "grad_norm": 0.08039204780862134, + "language_loss": 0.8364749, + "learning_rate": 0.00023964115830142025, + "loss": 0.84741211, + "num_input_tokens_seen": 294977600, + "router_z_loss_mlp": 0.14880371, + "step": 3556, + "time_per_iteration": 2.7107839584350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085439, + "balance_loss_mlp": 1.07070458, + "epoch": 0.6843016544824932, + "flos": 383742738432.0, + "grad_norm": 0.09666419591078326, + "language_loss": 0.87049448, + "learning_rate": 0.00023937523617432522, + "loss": 0.88134885, + "num_input_tokens_seen": 295039408, + "router_z_loss_mlp": 0.14709473, + "step": 3557, + "time_per_iteration": 2.468167781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082368, + "balance_loss_mlp": 1.06743097, + "epoch": 0.6844940361677568, + "flos": 1439035476480.0, + "grad_norm": 0.09214096887844989, + "language_loss": 0.86638856, + "learning_rate": 0.00023910941523073705, + "loss": 0.87721217, + "num_input_tokens_seen": 295142928, + "router_z_loss_mlp": 0.14916992, + "step": 3558, + 
"time_per_iteration": 3.9070351123809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081618, + "balance_loss_mlp": 1.06658614, + "epoch": 0.6846864178530204, + "flos": 520870860288.0, + "grad_norm": 0.07061573150035702, + "language_loss": 0.8673026, + "learning_rate": 0.0002388436955738566, + "loss": 0.87811875, + "num_input_tokens_seen": 295215504, + "router_z_loss_mlp": 0.15002441, + "step": 3559, + "time_per_iteration": 2.6991312503814697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079541, + "balance_loss_mlp": 1.06413877, + "epoch": 0.6848787995382839, + "flos": 717946053120.0, + "grad_norm": 0.09206834643003141, + "language_loss": 0.81440175, + "learning_rate": 0.00023857807730684523, + "loss": 0.8251971, + "num_input_tokens_seen": 295291024, + "router_z_loss_mlp": 0.15380859, + "step": 3560, + "time_per_iteration": 2.8983073234558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081366, + "balance_loss_mlp": 1.06604719, + "epoch": 0.6850711812235475, + "flos": 511061571072.0, + "grad_norm": 0.09081340298912512, + "language_loss": 0.8223151, + "learning_rate": 0.00023831256053282547, + "loss": 0.83312881, + "num_input_tokens_seen": 295363248, + "router_z_loss_mlp": 0.1529541, + "step": 3561, + "time_per_iteration": 2.7063052654266357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080271, + "balance_loss_mlp": 1.06454742, + "epoch": 0.6852635629088111, + "flos": 668151493632.0, + "grad_norm": 0.14586084526989435, + "language_loss": 0.78329659, + "learning_rate": 0.00023804714535488003, + "loss": 0.79409927, + "num_input_tokens_seen": 295442032, + "router_z_loss_mlp": 0.15710449, + "step": 3562, + "time_per_iteration": 2.9026236534118652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012311, + "balance_loss_mlp": 1.00525403, + "epoch": 0.6854559445940747, + "flos": 1522980071424.0, + "grad_norm": 0.007277946766615099, + "language_loss": 0.7980963, + 
"learning_rate": 0.0002377818318760519, + "loss": 0.80821943, + "num_input_tokens_seen": 295680560, + "router_z_loss_mlp": 0.07080078, + "step": 3563, + "time_per_iteration": 4.994298696517944 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083331, + "balance_loss_mlp": 1.06790555, + "epoch": 0.6856483262793382, + "flos": 454203168768.0, + "grad_norm": 0.0750757790304577, + "language_loss": 0.80663192, + "learning_rate": 0.00023751662019934488, + "loss": 0.81746531, + "num_input_tokens_seen": 295745712, + "router_z_loss_mlp": 0.1541748, + "step": 3564, + "time_per_iteration": 2.6247575283050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087229, + "balance_loss_mlp": 1.07223213, + "epoch": 0.6858407079646017, + "flos": 615552763392.0, + "grad_norm": 0.06101613558394618, + "language_loss": 0.79368252, + "learning_rate": 0.00023725151042772364, + "loss": 0.80455482, + "num_input_tokens_seen": 295815104, + "router_z_loss_mlp": 0.1496582, + "step": 3565, + "time_per_iteration": 2.733064651489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088358, + "balance_loss_mlp": 1.07328987, + "epoch": 0.6860330896498653, + "flos": 466053087744.0, + "grad_norm": 0.06639135766618469, + "language_loss": 0.83069307, + "learning_rate": 0.00023698650266411276, + "loss": 0.8415767, + "num_input_tokens_seen": 295882928, + "router_z_loss_mlp": 0.1505127, + "step": 3566, + "time_per_iteration": 2.6350362300872803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091248, + "balance_loss_mlp": 1.07638311, + "epoch": 0.6862254713351289, + "flos": 864270425088.0, + "grad_norm": 0.08738258373054857, + "language_loss": 0.83224273, + "learning_rate": 0.00023672159701139755, + "loss": 0.84315515, + "num_input_tokens_seen": 295970960, + "router_z_loss_mlp": 0.14831543, + "step": 3567, + "time_per_iteration": 3.2377495765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093919, + 
"balance_loss_mlp": 1.07899451, + "epoch": 0.6864178530203925, + "flos": 447141523968.0, + "grad_norm": 0.08115353052200597, + "language_loss": 0.86123919, + "learning_rate": 0.00023645679357242296, + "loss": 0.87217844, + "num_input_tokens_seen": 296036128, + "router_z_loss_mlp": 0.14904785, + "step": 3568, + "time_per_iteration": 2.5912718772888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085209, + "balance_loss_mlp": 1.07000983, + "epoch": 0.6866102347056561, + "flos": 424269093888.0, + "grad_norm": 0.0774589822263595, + "language_loss": 0.84057611, + "learning_rate": 0.00023619209244999534, + "loss": 0.85142827, + "num_input_tokens_seen": 296101440, + "router_z_loss_mlp": 0.1517334, + "step": 3569, + "time_per_iteration": 2.5609703063964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088355, + "balance_loss_mlp": 1.07344151, + "epoch": 0.6868026163909196, + "flos": 472373586432.0, + "grad_norm": 0.09034321435408287, + "language_loss": 0.84892517, + "learning_rate": 0.0002359274937468806, + "loss": 0.85980874, + "num_input_tokens_seen": 296165504, + "router_z_loss_mlp": 0.14904785, + "step": 3570, + "time_per_iteration": 2.5558407306671143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088136, + "balance_loss_mlp": 1.07292545, + "epoch": 0.6869949980761831, + "flos": 464190124032.0, + "grad_norm": 0.06600150491897518, + "language_loss": 0.78017968, + "learning_rate": 0.00023566299756580512, + "loss": 0.79106104, + "num_input_tokens_seen": 296236880, + "router_z_loss_mlp": 0.15185547, + "step": 3571, + "time_per_iteration": 2.6505472660064697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094782, + "balance_loss_mlp": 1.07961917, + "epoch": 0.6871873797614467, + "flos": 426235944960.0, + "grad_norm": 0.08793371373837118, + "language_loss": 0.78414327, + "learning_rate": 0.0002353986040094551, + "loss": 0.79509115, + "num_input_tokens_seen": 296299776, + "router_z_loss_mlp": 
0.15136719, + "step": 3572, + "time_per_iteration": 2.510256290435791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093503, + "balance_loss_mlp": 1.07853007, + "epoch": 0.6873797614467103, + "flos": 443625569280.0, + "grad_norm": 0.08501423170750884, + "language_loss": 0.79296732, + "learning_rate": 0.00023513431318047796, + "loss": 0.80390239, + "num_input_tokens_seen": 296365408, + "router_z_loss_mlp": 0.14953613, + "step": 3573, + "time_per_iteration": 2.5400164127349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086278, + "balance_loss_mlp": 1.07101965, + "epoch": 0.6875721431319738, + "flos": 992323436544.0, + "grad_norm": 0.07288870578041759, + "language_loss": 0.76573622, + "learning_rate": 0.00023487012518147977, + "loss": 0.77659905, + "num_input_tokens_seen": 296445488, + "router_z_loss_mlp": 0.15234375, + "step": 3574, + "time_per_iteration": 3.248400926589966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084828, + "balance_loss_mlp": 1.06955671, + "epoch": 0.6877645248172374, + "flos": 1285513638912.0, + "grad_norm": 0.0698790191488345, + "language_loss": 0.84093738, + "learning_rate": 0.00023460604011502772, + "loss": 0.85178566, + "num_input_tokens_seen": 296529936, + "router_z_loss_mlp": 0.15258789, + "step": 3575, + "time_per_iteration": 3.6553032398223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085647, + "balance_loss_mlp": 1.07138944, + "epoch": 0.687956906502501, + "flos": 876733383168.0, + "grad_norm": 0.0800354404876214, + "language_loss": 0.85504699, + "learning_rate": 0.00023434205808364845, + "loss": 0.8659035, + "num_input_tokens_seen": 296607488, + "router_z_loss_mlp": 0.1427002, + "step": 3576, + "time_per_iteration": 3.173497200012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094726, + "balance_loss_mlp": 1.07970524, + "epoch": 0.6881492881877646, + "flos": 563324419584.0, + "grad_norm": 0.07355938881939053, + 
"language_loss": 0.8520726, + "learning_rate": 0.00023407817918982932, + "loss": 0.86301988, + "num_input_tokens_seen": 296678672, + "router_z_loss_mlp": 0.14990234, + "step": 3577, + "time_per_iteration": 2.810378313064575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094391, + "balance_loss_mlp": 1.07960927, + "epoch": 0.6883416698730281, + "flos": 795127104000.0, + "grad_norm": 0.06289078804693891, + "language_loss": 0.78850877, + "learning_rate": 0.00023381440353601718, + "loss": 0.79945266, + "num_input_tokens_seen": 296758896, + "router_z_loss_mlp": 0.14758301, + "step": 3578, + "time_per_iteration": 3.0247299671173096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091227, + "balance_loss_mlp": 1.07633758, + "epoch": 0.6885340515582916, + "flos": 723621579264.0, + "grad_norm": 0.07119192926976899, + "language_loss": 0.85820395, + "learning_rate": 0.00023355073122461822, + "loss": 0.86911619, + "num_input_tokens_seen": 296830736, + "router_z_loss_mlp": 0.14868164, + "step": 3579, + "time_per_iteration": 2.90800404548645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094317, + "balance_loss_mlp": 1.07949877, + "epoch": 0.6887264332435552, + "flos": 1010926282752.0, + "grad_norm": 0.07022836851030782, + "language_loss": 0.82529831, + "learning_rate": 0.00023328716235799973, + "loss": 0.83624148, + "num_input_tokens_seen": 296911504, + "router_z_loss_mlp": 0.14782715, + "step": 3580, + "time_per_iteration": 3.300455331802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100506, + "balance_loss_mlp": 1.08599877, + "epoch": 0.6889188149288188, + "flos": 585262983168.0, + "grad_norm": 0.08437878588236032, + "language_loss": 0.8341161, + "learning_rate": 0.00023302369703848803, + "loss": 0.84512115, + "num_input_tokens_seen": 296981488, + "router_z_loss_mlp": 0.14489746, + "step": 3581, + "time_per_iteration": 2.6898550987243652 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01098143, + "balance_loss_mlp": 1.08326566, + "epoch": 0.6891111966140824, + "flos": 636119889408.0, + "grad_norm": 0.08155365941467911, + "language_loss": 0.80103743, + "learning_rate": 0.00023276033536836937, + "loss": 0.81201887, + "num_input_tokens_seen": 297054896, + "router_z_loss_mlp": 0.14868164, + "step": 3582, + "time_per_iteration": 2.7915916442871094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109, + "balance_loss_mlp": 1.07499123, + "epoch": 0.6893035782993459, + "flos": 495270609408.0, + "grad_norm": 0.0619697140217233, + "language_loss": 0.84551424, + "learning_rate": 0.00023249707744988984, + "loss": 0.8564142, + "num_input_tokens_seen": 297128224, + "router_z_loss_mlp": 0.14990234, + "step": 3583, + "time_per_iteration": 2.659757375717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092214, + "balance_loss_mlp": 1.07747972, + "epoch": 0.6894959599846094, + "flos": 458215792128.0, + "grad_norm": 0.08143589972695583, + "language_loss": 0.82035959, + "learning_rate": 0.00023223392338525529, + "loss": 0.83128172, + "num_input_tokens_seen": 297191312, + "router_z_loss_mlp": 0.1472168, + "step": 3584, + "time_per_iteration": 2.5301597118377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094389, + "balance_loss_mlp": 1.07959485, + "epoch": 0.689688341669873, + "flos": 505003175424.0, + "grad_norm": 0.21421019470066024, + "language_loss": 0.78488421, + "learning_rate": 0.00023197087327663107, + "loss": 0.79582822, + "num_input_tokens_seen": 297261904, + "router_z_loss_mlp": 0.14770508, + "step": 3585, + "time_per_iteration": 2.679481029510498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096698, + "balance_loss_mlp": 1.08208311, + "epoch": 0.6898807233551366, + "flos": 763910797824.0, + "grad_norm": 0.06326504558768707, + "language_loss": 0.81044286, + "learning_rate": 0.00023170792722614243, + "loss": 0.82140982, + "num_input_tokens_seen": 
297338352, + "router_z_loss_mlp": 0.14599609, + "step": 3586, + "time_per_iteration": 2.9200220108032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099186, + "balance_loss_mlp": 1.08436847, + "epoch": 0.6900731050404002, + "flos": 583337977344.0, + "grad_norm": 0.05947736637449061, + "language_loss": 0.83560526, + "learning_rate": 0.00023144508533587377, + "loss": 0.84659708, + "num_input_tokens_seen": 297416688, + "router_z_loss_mlp": 0.14819336, + "step": 3587, + "time_per_iteration": 2.8857367038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098984, + "balance_loss_mlp": 1.08429766, + "epoch": 0.6902654867256637, + "flos": 711865262592.0, + "grad_norm": 0.08877001768581633, + "language_loss": 0.78586876, + "learning_rate": 0.0002311823477078698, + "loss": 0.79685855, + "num_input_tokens_seen": 297499968, + "router_z_loss_mlp": 0.14660645, + "step": 3588, + "time_per_iteration": 2.9328413009643555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107388, + "balance_loss_mlp": 1.09263027, + "epoch": 0.6904578684109273, + "flos": 597112902144.0, + "grad_norm": 0.06868681048998228, + "language_loss": 0.85218358, + "learning_rate": 0.00023091971444413428, + "loss": 0.86325753, + "num_input_tokens_seen": 297574480, + "router_z_loss_mlp": 0.14733887, + "step": 3589, + "time_per_iteration": 2.8086462020874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104804, + "balance_loss_mlp": 1.09003377, + "epoch": 0.6906502500961909, + "flos": 585040527360.0, + "grad_norm": 0.06776060090181614, + "language_loss": 0.82496858, + "learning_rate": 0.00023065718564663012, + "loss": 0.83601665, + "num_input_tokens_seen": 297645360, + "router_z_loss_mlp": 0.14733887, + "step": 3590, + "time_per_iteration": 2.7810418605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_mlp": 1.0248282, + "epoch": 0.6908426317814544, + "flos": 1587827017728.0, + 
"grad_norm": 0.01280069102087921, + "language_loss": 0.73911589, + "learning_rate": 0.00023039476141728011, + "loss": 0.7494362, + "num_input_tokens_seen": 297879472, + "router_z_loss_mlp": 0.07177734, + "step": 3591, + "time_per_iteration": 5.021310806274414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098984, + "balance_loss_mlp": 1.08440506, + "epoch": 0.6910350134667179, + "flos": 500780579328.0, + "grad_norm": 0.06380479300315355, + "language_loss": 0.81160456, + "learning_rate": 0.0002301324418579666, + "loss": 0.8225944, + "num_input_tokens_seen": 297950672, + "router_z_loss_mlp": 0.14562988, + "step": 3592, + "time_per_iteration": 2.7419848442077637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_mlp": 1.02446389, + "epoch": 0.6912273951519815, + "flos": 1409194257408.0, + "grad_norm": 0.013008866579229384, + "language_loss": 0.78688473, + "learning_rate": 0.00022987022707053107, + "loss": 0.79719996, + "num_input_tokens_seen": 298171728, + "router_z_loss_mlp": 0.07080078, + "step": 3593, + "time_per_iteration": 4.783770322799683 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101571, + "balance_loss_mlp": 1.08720624, + "epoch": 0.6914197768372451, + "flos": 635279625216.0, + "grad_norm": 0.07461713066007468, + "language_loss": 0.80640858, + "learning_rate": 0.00022960811715677415, + "loss": 0.8174243, + "num_input_tokens_seen": 298250304, + "router_z_loss_mlp": 0.14355469, + "step": 3594, + "time_per_iteration": 2.8687844276428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101393, + "balance_loss_mlp": 1.08688569, + "epoch": 0.6916121585225087, + "flos": 558044246016.0, + "grad_norm": 0.06773480939306162, + "language_loss": 0.81380737, + "learning_rate": 0.00022934611221845608, + "loss": 0.82482135, + "num_input_tokens_seen": 298328000, + "router_z_loss_mlp": 0.14489746, + "step": 3595, + "time_per_iteration": 2.876573085784912 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098147, + "balance_loss_mlp": 1.08306754, + "epoch": 0.6918045402077723, + "flos": 529167748608.0, + "grad_norm": 0.07714844100639354, + "language_loss": 0.78139538, + "learning_rate": 0.00022908421235729609, + "loss": 0.79237688, + "num_input_tokens_seen": 298406832, + "router_z_loss_mlp": 0.1505127, + "step": 3596, + "time_per_iteration": 2.758575439453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090977, + "balance_loss_mlp": 1.07603967, + "epoch": 0.6919969218930357, + "flos": 570351559680.0, + "grad_norm": 0.07010095160576196, + "language_loss": 0.85004246, + "learning_rate": 0.0002288224176749728, + "loss": 0.86095226, + "num_input_tokens_seen": 298477584, + "router_z_loss_mlp": 0.14904785, + "step": 3597, + "time_per_iteration": 2.6715195178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103476, + "balance_loss_mlp": 1.08851576, + "epoch": 0.6921893035782993, + "flos": 683305196544.0, + "grad_norm": 0.08507252289690358, + "language_loss": 0.78096193, + "learning_rate": 0.00022856072827312385, + "loss": 0.79199672, + "num_input_tokens_seen": 298551872, + "router_z_loss_mlp": 0.14929199, + "step": 3598, + "time_per_iteration": 2.8153061866760254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086452, + "balance_loss_mlp": 1.07144332, + "epoch": 0.6923816852635629, + "flos": 546745324032.0, + "grad_norm": 0.09179482408325199, + "language_loss": 0.76836538, + "learning_rate": 0.00022829914425334598, + "loss": 0.77922994, + "num_input_tokens_seen": 298619680, + "router_z_loss_mlp": 0.14978027, + "step": 3599, + "time_per_iteration": 2.6517763137817383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090108, + "balance_loss_mlp": 1.07546949, + "epoch": 0.6925740669488265, + "flos": 510036300288.0, + "grad_norm": 0.06988561333174233, + "language_loss": 0.80617976, + "learning_rate": 0.0002280376657171956, + "loss": 
0.8170808, + "num_input_tokens_seen": 298690080, + "router_z_loss_mlp": 0.14624023, + "step": 3600, + "time_per_iteration": 2.668285369873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090912, + "balance_loss_mlp": 1.075737, + "epoch": 0.69276644863409, + "flos": 869424689664.0, + "grad_norm": 0.0699308267355068, + "language_loss": 0.76665217, + "learning_rate": 0.00022777629276618706, + "loss": 0.77756131, + "num_input_tokens_seen": 298777712, + "router_z_loss_mlp": 0.1517334, + "step": 3601, + "time_per_iteration": 3.14390230178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108857, + "balance_loss_mlp": 1.07352614, + "epoch": 0.6929588303193536, + "flos": 625772086272.0, + "grad_norm": 0.07480870376538759, + "language_loss": 0.77635819, + "learning_rate": 0.0002275150255017947, + "loss": 0.78724384, + "num_input_tokens_seen": 298854368, + "router_z_loss_mlp": 0.15039062, + "step": 3602, + "time_per_iteration": 2.8169686794281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012655, + "balance_loss_mlp": 1.00578892, + "epoch": 0.6931512120046172, + "flos": 1545382996992.0, + "grad_norm": 0.008701900485553249, + "language_loss": 0.75732672, + "learning_rate": 0.0002272538640254511, + "loss": 0.76745325, + "num_input_tokens_seen": 299091664, + "router_z_loss_mlp": 0.06884766, + "step": 3603, + "time_per_iteration": 5.027594327926636 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011913, + "balance_loss_mlp": 1.00504613, + "epoch": 0.6933435936898807, + "flos": 1448230606848.0, + "grad_norm": 0.008216841516263335, + "language_loss": 0.75127101, + "learning_rate": 0.0002269928084385487, + "loss": 0.76139021, + "num_input_tokens_seen": 299312656, + "router_z_loss_mlp": 0.06884766, + "step": 3604, + "time_per_iteration": 4.732091426849365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082938, + "balance_loss_mlp": 1.0677743, + "epoch": 0.6935359753751443, + 
"flos": 540896901120.0, + "grad_norm": 0.10647094637473072, + "language_loss": 0.84698266, + "learning_rate": 0.0002267318588424379, + "loss": 0.85781205, + "num_input_tokens_seen": 299381136, + "router_z_loss_mlp": 0.15161133, + "step": 3605, + "time_per_iteration": 2.6778976917266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081609, + "balance_loss_mlp": 1.06657648, + "epoch": 0.6937283570604078, + "flos": 719396411904.0, + "grad_norm": 0.06584839695977855, + "language_loss": 0.87588215, + "learning_rate": 0.00022647101533842845, + "loss": 0.88669825, + "num_input_tokens_seen": 299455216, + "router_z_loss_mlp": 0.15002441, + "step": 3606, + "time_per_iteration": 2.8986434936523438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080072, + "balance_loss_mlp": 1.06539774, + "epoch": 0.6939207387456714, + "flos": 522165574656.0, + "grad_norm": 0.07095695288657847, + "language_loss": 0.76177275, + "learning_rate": 0.00022621027802778872, + "loss": 0.77257347, + "num_input_tokens_seen": 299524352, + "router_z_loss_mlp": 0.14660645, + "step": 3607, + "time_per_iteration": 2.68804931640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.06613147, + "epoch": 0.694113120430935, + "flos": 535359767040.0, + "grad_norm": 0.08196461827358215, + "language_loss": 0.78305531, + "learning_rate": 0.00022594964701174586, + "loss": 0.79386854, + "num_input_tokens_seen": 299594960, + "router_z_loss_mlp": 0.1517334, + "step": 3608, + "time_per_iteration": 2.6762053966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086087, + "balance_loss_mlp": 1.07163918, + "epoch": 0.6943055021161986, + "flos": 523358972928.0, + "grad_norm": 0.08367512296737743, + "language_loss": 0.84715855, + "learning_rate": 0.00022568912239148586, + "loss": 0.85801935, + "num_input_tokens_seen": 299662560, + "router_z_loss_mlp": 0.14416504, + "step": 3609, + "time_per_iteration": 
2.7212107181549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080855, + "balance_loss_mlp": 1.06584692, + "epoch": 0.694497883801462, + "flos": 484902982656.0, + "grad_norm": 0.07445866768245664, + "language_loss": 0.81263393, + "learning_rate": 0.00022542870426815344, + "loss": 0.82344246, + "num_input_tokens_seen": 299734896, + "router_z_loss_mlp": 0.14990234, + "step": 3610, + "time_per_iteration": 2.733375072479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010749, + "balance_loss_mlp": 1.06001055, + "epoch": 0.6946902654867256, + "flos": 461474786304.0, + "grad_norm": 0.07362557272852362, + "language_loss": 0.86188352, + "learning_rate": 0.00022516839274285173, + "loss": 0.8726325, + "num_input_tokens_seen": 299799424, + "router_z_loss_mlp": 0.14880371, + "step": 3611, + "time_per_iteration": 2.5730910301208496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078598, + "balance_loss_mlp": 1.06367326, + "epoch": 0.6948826471719892, + "flos": 512855525376.0, + "grad_norm": 0.07586635796694485, + "language_loss": 0.75025129, + "learning_rate": 0.00022490818791664265, + "loss": 0.76103735, + "num_input_tokens_seen": 299868272, + "router_z_loss_mlp": 0.14892578, + "step": 3612, + "time_per_iteration": 2.6340460777282715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081262, + "balance_loss_mlp": 1.06613493, + "epoch": 0.6950750288572528, + "flos": 557184531456.0, + "grad_norm": 0.07004728730886566, + "language_loss": 0.855506, + "learning_rate": 0.00022464808989054676, + "loss": 0.8663187, + "num_input_tokens_seen": 299939136, + "router_z_loss_mlp": 0.15100098, + "step": 3613, + "time_per_iteration": 2.691323757171631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079633, + "balance_loss_mlp": 1.06455255, + "epoch": 0.6952674105425164, + "flos": 542475740160.0, + "grad_norm": 0.07439341927968558, + "language_loss": 0.76007962, + "learning_rate": 
0.00022438809876554284, + "loss": 0.77087599, + "num_input_tokens_seen": 300009472, + "router_z_loss_mlp": 0.15063477, + "step": 3614, + "time_per_iteration": 2.6413824558258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083746, + "balance_loss_mlp": 1.06851149, + "epoch": 0.6954597922277799, + "flos": 546742752768.0, + "grad_norm": 0.07752478508749527, + "language_loss": 0.80230355, + "learning_rate": 0.00022412821464256873, + "loss": 0.81314099, + "num_input_tokens_seen": 300081008, + "router_z_loss_mlp": 0.15209961, + "step": 3615, + "time_per_iteration": 2.697600841522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085665, + "balance_loss_mlp": 1.07031107, + "epoch": 0.6956521739130435, + "flos": 519511905792.0, + "grad_norm": 0.07699011833004216, + "language_loss": 0.82032132, + "learning_rate": 0.00022386843762252023, + "loss": 0.83117795, + "num_input_tokens_seen": 300149856, + "router_z_loss_mlp": 0.15332031, + "step": 3616, + "time_per_iteration": 2.6330502033233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089945, + "balance_loss_mlp": 1.0750314, + "epoch": 0.695844555598307, + "flos": 466275543552.0, + "grad_norm": 0.09639318919512468, + "language_loss": 0.79538012, + "learning_rate": 0.00022360876780625193, + "loss": 0.80627954, + "num_input_tokens_seen": 300217344, + "router_z_loss_mlp": 0.14880371, + "step": 3617, + "time_per_iteration": 2.582629680633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079925, + "balance_loss_mlp": 1.06488085, + "epoch": 0.6960369372835706, + "flos": 600663361536.0, + "grad_norm": 0.056274852551945066, + "language_loss": 0.80103874, + "learning_rate": 0.00022334920529457604, + "loss": 0.81183803, + "num_input_tokens_seen": 300305584, + "router_z_loss_mlp": 0.15026855, + "step": 3618, + "time_per_iteration": 2.936511754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091126, + "balance_loss_mlp": 
1.07581925, + "epoch": 0.6962293189688342, + "flos": 644233969152.0, + "grad_norm": 0.07050393221618255, + "language_loss": 0.87297118, + "learning_rate": 0.00022308975018826423, + "loss": 0.8838824, + "num_input_tokens_seen": 300386480, + "router_z_loss_mlp": 0.15283203, + "step": 3619, + "time_per_iteration": 2.8777477741241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086314, + "balance_loss_mlp": 1.07101941, + "epoch": 0.6964217006540977, + "flos": 638810634240.0, + "grad_norm": 0.06699510138661768, + "language_loss": 0.84512174, + "learning_rate": 0.00022283040258804564, + "loss": 0.85598493, + "num_input_tokens_seen": 300461840, + "router_z_loss_mlp": 0.15270996, + "step": 3620, + "time_per_iteration": 2.7737884521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082599, + "balance_loss_mlp": 1.0671612, + "epoch": 0.6966140823393613, + "flos": 652167811584.0, + "grad_norm": 0.06929377823135867, + "language_loss": 0.83519012, + "learning_rate": 0.00022257116259460802, + "loss": 0.84601611, + "num_input_tokens_seen": 300540400, + "router_z_loss_mlp": 0.15429688, + "step": 3621, + "time_per_iteration": 2.904534101486206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081028, + "balance_loss_mlp": 1.06585217, + "epoch": 0.6968064640246249, + "flos": 704492328960.0, + "grad_norm": 0.06749965217673044, + "language_loss": 0.81476158, + "learning_rate": 0.00022231203030859725, + "loss": 0.82557189, + "num_input_tokens_seen": 300624240, + "router_z_loss_mlp": 0.15148926, + "step": 3622, + "time_per_iteration": 2.979004144668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087747, + "balance_loss_mlp": 1.0728699, + "epoch": 0.6969988457098885, + "flos": 492555271680.0, + "grad_norm": 0.10955891307443118, + "language_loss": 0.83551806, + "learning_rate": 0.00022205300583061737, + "loss": 0.84639549, + "num_input_tokens_seen": 300689728, + "router_z_loss_mlp": 0.14855957, + "step": 
3623, + "time_per_iteration": 2.5939643383026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0101565, + "balance_loss_mlp": 1.00821149, + "epoch": 0.6971912273951519, + "flos": 1352592442368.0, + "grad_norm": 0.01064219692859378, + "language_loss": 0.82838202, + "learning_rate": 0.00022179408926123063, + "loss": 0.83853853, + "num_input_tokens_seen": 300913152, + "router_z_loss_mlp": 0.07421875, + "step": 3624, + "time_per_iteration": 4.894463539123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084586, + "balance_loss_mlp": 1.0696255, + "epoch": 0.6973836090804155, + "flos": 602459887104.0, + "grad_norm": 0.08187690915242911, + "language_loss": 0.77176243, + "learning_rate": 0.00022153528070095735, + "loss": 0.78260833, + "num_input_tokens_seen": 300985824, + "router_z_loss_mlp": 0.14941406, + "step": 3625, + "time_per_iteration": 2.740964651107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082128, + "balance_loss_mlp": 1.06735802, + "epoch": 0.6975759907656791, + "flos": 524065614336.0, + "grad_norm": 0.07883351153063048, + "language_loss": 0.87864482, + "learning_rate": 0.00022127658025027568, + "loss": 0.88946617, + "num_input_tokens_seen": 301058048, + "router_z_loss_mlp": 0.14758301, + "step": 3626, + "time_per_iteration": 2.694669723510742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081274, + "balance_loss_mlp": 1.06603932, + "epoch": 0.6977683724509427, + "flos": 480912754176.0, + "grad_norm": 0.2474524571141355, + "language_loss": 0.84912121, + "learning_rate": 0.00022101798800962258, + "loss": 0.85993397, + "num_input_tokens_seen": 301127472, + "router_z_loss_mlp": 0.15209961, + "step": 3627, + "time_per_iteration": 2.6004905700683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082967, + "balance_loss_mlp": 1.06798291, + "epoch": 0.6979607541362063, + "flos": 522625167360.0, + "grad_norm": 0.07660061174433377, + "language_loss": 0.7872625, + 
"learning_rate": 0.00022075950407939227, + "loss": 0.79809219, + "num_input_tokens_seen": 301193920, + "router_z_loss_mlp": 0.14978027, + "step": 3628, + "time_per_iteration": 2.6326966285705566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090002, + "balance_loss_mlp": 1.07531548, + "epoch": 0.6981531358214698, + "flos": 548077114368.0, + "grad_norm": 0.0701967106904507, + "language_loss": 0.82905591, + "learning_rate": 0.0002205011285599367, + "loss": 0.83995599, + "num_input_tokens_seen": 301264256, + "router_z_loss_mlp": 0.14672852, + "step": 3629, + "time_per_iteration": 2.639697551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091118, + "balance_loss_mlp": 1.07614517, + "epoch": 0.6983455175067333, + "flos": 700052419584.0, + "grad_norm": 0.06279315859884016, + "language_loss": 0.80564064, + "learning_rate": 0.00022024286155156658, + "loss": 0.8165518, + "num_input_tokens_seen": 301337696, + "router_z_loss_mlp": 0.1496582, + "step": 3630, + "time_per_iteration": 2.8495450019836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108414, + "balance_loss_mlp": 1.06932223, + "epoch": 0.6985378991919969, + "flos": 485078450688.0, + "grad_norm": 0.06855398456951894, + "language_loss": 0.85904682, + "learning_rate": 0.00021998470315454994, + "loss": 0.86988831, + "num_input_tokens_seen": 301407776, + "router_z_loss_mlp": 0.14794922, + "step": 3631, + "time_per_iteration": 2.67564058303833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088432, + "balance_loss_mlp": 1.07317352, + "epoch": 0.6987302808772605, + "flos": 558780622848.0, + "grad_norm": 0.0636105841757025, + "language_loss": 0.86414385, + "learning_rate": 0.00021972665346911275, + "loss": 0.87502813, + "num_input_tokens_seen": 301475120, + "router_z_loss_mlp": 0.15234375, + "step": 3632, + "time_per_iteration": 2.7555418014526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095235, + 
"balance_loss_mlp": 1.0801785, + "epoch": 0.698922662562524, + "flos": 483593587200.0, + "grad_norm": 0.07511038810381725, + "language_loss": 0.79825956, + "learning_rate": 0.00021946871259543877, + "loss": 0.80921185, + "num_input_tokens_seen": 301542416, + "router_z_loss_mlp": 0.15026855, + "step": 3633, + "time_per_iteration": 2.633380651473999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092543, + "balance_loss_mlp": 1.07767808, + "epoch": 0.6991150442477876, + "flos": 718909655040.0, + "grad_norm": 0.09690309084755197, + "language_loss": 0.82919359, + "learning_rate": 0.00021921088063366957, + "loss": 0.840119, + "num_input_tokens_seen": 301620672, + "router_z_loss_mlp": 0.14831543, + "step": 3634, + "time_per_iteration": 2.9242210388183594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092511, + "balance_loss_mlp": 1.0774194, + "epoch": 0.6993074259330512, + "flos": 489128150016.0, + "grad_norm": 0.0639201840368843, + "language_loss": 0.81773442, + "learning_rate": 0.00021895315768390435, + "loss": 0.82865953, + "num_input_tokens_seen": 301688016, + "router_z_loss_mlp": 0.15063477, + "step": 3635, + "time_per_iteration": 2.6489744186401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095535, + "balance_loss_mlp": 1.0803355, + "epoch": 0.6994998076183148, + "flos": 718089214464.0, + "grad_norm": 0.060489852807190235, + "language_loss": 0.87983084, + "learning_rate": 0.00021869554384619999, + "loss": 0.89078617, + "num_input_tokens_seen": 301771184, + "router_z_loss_mlp": 0.15185547, + "step": 3636, + "time_per_iteration": 3.0024359226226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090766, + "balance_loss_mlp": 1.07566249, + "epoch": 0.6996921893035783, + "flos": 579016636416.0, + "grad_norm": 0.08372148054785959, + "language_loss": 0.80742836, + "learning_rate": 0.00021843803922057115, + "loss": 0.81833601, + "num_input_tokens_seen": 301844528, + "router_z_loss_mlp": 
0.15075684, + "step": 3637, + "time_per_iteration": 2.7597806453704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099553, + "balance_loss_mlp": 1.08446145, + "epoch": 0.6998845709888418, + "flos": 518629796352.0, + "grad_norm": 0.07613673241424718, + "language_loss": 0.81840616, + "learning_rate": 0.00021818064390698977, + "loss": 0.82940167, + "num_input_tokens_seen": 301914960, + "router_z_loss_mlp": 0.15075684, + "step": 3638, + "time_per_iteration": 2.662210702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097696, + "balance_loss_mlp": 1.08278298, + "epoch": 0.7000769526741054, + "flos": 620951505408.0, + "grad_norm": 0.0762563380177704, + "language_loss": 0.86943358, + "learning_rate": 0.0002179233580053861, + "loss": 0.88041055, + "num_input_tokens_seen": 301986352, + "router_z_loss_mlp": 0.14892578, + "step": 3639, + "time_per_iteration": 2.76003098487854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088923, + "balance_loss_mlp": 1.0736047, + "epoch": 0.700269334359369, + "flos": 559946856960.0, + "grad_norm": 0.06684483131763276, + "language_loss": 0.85643017, + "learning_rate": 0.00021766618161564688, + "loss": 0.86731935, + "num_input_tokens_seen": 302060544, + "router_z_loss_mlp": 0.1529541, + "step": 3640, + "time_per_iteration": 2.710590362548828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089129, + "balance_loss_mlp": 1.07377481, + "epoch": 0.7004617160446326, + "flos": 483343967232.0, + "grad_norm": 0.08652937172490481, + "language_loss": 0.87291199, + "learning_rate": 0.00021740911483761677, + "loss": 0.88380325, + "num_input_tokens_seen": 302127232, + "router_z_loss_mlp": 0.15344238, + "step": 3641, + "time_per_iteration": 2.587820529937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108697, + "balance_loss_mlp": 1.0719738, + "epoch": 0.7006540977298961, + "flos": 696981003264.0, + "grad_norm": 0.05890602185122373, + "language_loss": 
0.92162985, + "learning_rate": 0.00021715215777109837, + "loss": 0.93249953, + "num_input_tokens_seen": 302207056, + "router_z_loss_mlp": 0.14978027, + "step": 3642, + "time_per_iteration": 2.9837920665740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087066, + "balance_loss_mlp": 1.07216477, + "epoch": 0.7008464794151597, + "flos": 504775950336.0, + "grad_norm": 0.0660113097105393, + "language_loss": 0.84073913, + "learning_rate": 0.00021689531051585103, + "loss": 0.85160977, + "num_input_tokens_seen": 302275632, + "router_z_loss_mlp": 0.14904785, + "step": 3643, + "time_per_iteration": 2.5994443893432617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083136, + "balance_loss_mlp": 1.06766284, + "epoch": 0.7010388611004232, + "flos": 537242554368.0, + "grad_norm": 0.08730620791306808, + "language_loss": 0.80473441, + "learning_rate": 0.00021663857317159196, + "loss": 0.81556571, + "num_input_tokens_seen": 302343600, + "router_z_loss_mlp": 0.15454102, + "step": 3644, + "time_per_iteration": 2.636361837387085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089327, + "balance_loss_mlp": 1.07431817, + "epoch": 0.7012312427856868, + "flos": 547259245056.0, + "grad_norm": 0.07432760793631779, + "language_loss": 0.82087952, + "learning_rate": 0.00021638194583799487, + "loss": 0.8317728, + "num_input_tokens_seen": 302414656, + "router_z_loss_mlp": 0.14978027, + "step": 3645, + "time_per_iteration": 2.697885513305664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082052, + "balance_loss_mlp": 1.06686449, + "epoch": 0.7014236244709504, + "flos": 941409630720.0, + "grad_norm": 0.07667470628550804, + "language_loss": 0.82340956, + "learning_rate": 0.00021612542861469176, + "loss": 0.83423007, + "num_input_tokens_seen": 302495120, + "router_z_loss_mlp": 0.15185547, + "step": 3646, + "time_per_iteration": 3.2449471950531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075293, 
+ "balance_loss_mlp": 1.06002283, + "epoch": 0.7016160061562139, + "flos": 525167608320.0, + "grad_norm": 0.08774418992406267, + "language_loss": 0.82529956, + "learning_rate": 0.00021586902160127135, + "loss": 0.83605254, + "num_input_tokens_seen": 302563024, + "router_z_loss_mlp": 0.15246582, + "step": 3647, + "time_per_iteration": 2.6226844787597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086185, + "balance_loss_mlp": 1.07074714, + "epoch": 0.7018083878414775, + "flos": 373385023488.0, + "grad_norm": 0.12454789428341487, + "language_loss": 0.73784959, + "learning_rate": 0.00021561272489727974, + "loss": 0.74871147, + "num_input_tokens_seen": 302624544, + "router_z_loss_mlp": 0.15429688, + "step": 3648, + "time_per_iteration": 2.445005178451538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088543, + "balance_loss_mlp": 1.07320118, + "epoch": 0.7020007695267411, + "flos": 527784201216.0, + "grad_norm": 0.07345436564624129, + "language_loss": 0.79976106, + "learning_rate": 0.0002153565386022199, + "loss": 0.81064653, + "num_input_tokens_seen": 302697856, + "router_z_loss_mlp": 0.15332031, + "step": 3649, + "time_per_iteration": 2.656079053878784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107984, + "balance_loss_mlp": 1.06450915, + "epoch": 0.7021931512120047, + "flos": 690154297344.0, + "grad_norm": 0.07891543981767615, + "language_loss": 0.82497263, + "learning_rate": 0.00021510046281555262, + "loss": 0.83577102, + "num_input_tokens_seen": 302771984, + "router_z_loss_mlp": 0.15307617, + "step": 3650, + "time_per_iteration": 2.8389041423797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108619, + "balance_loss_mlp": 1.0712415, + "epoch": 0.7023855328972681, + "flos": 639784147968.0, + "grad_norm": 0.08322895667729725, + "language_loss": 0.82151127, + "learning_rate": 0.0002148444976366949, + "loss": 0.83237314, + "num_input_tokens_seen": 302838832, + "router_z_loss_mlp": 
0.14929199, + "step": 3651, + "time_per_iteration": 2.7878010272979736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088068, + "balance_loss_mlp": 1.07308304, + "epoch": 0.7025779145825317, + "flos": 560940194304.0, + "grad_norm": 0.09064059041024937, + "language_loss": 0.82483077, + "learning_rate": 0.00021458864316502136, + "loss": 0.83571148, + "num_input_tokens_seen": 302909952, + "router_z_loss_mlp": 0.1496582, + "step": 3652, + "time_per_iteration": 2.7618918418884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081972, + "balance_loss_mlp": 1.06670094, + "epoch": 0.7027702962677953, + "flos": 447445472256.0, + "grad_norm": 0.07081207687484876, + "language_loss": 0.87084836, + "learning_rate": 0.0002143328994998634, + "loss": 0.88166809, + "num_input_tokens_seen": 302973056, + "router_z_loss_mlp": 0.15258789, + "step": 3653, + "time_per_iteration": 2.510607957839966 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082487, + "balance_loss_mlp": 1.06681085, + "epoch": 0.7029626779530589, + "flos": 622500609024.0, + "grad_norm": 0.07138431513844615, + "language_loss": 0.78192198, + "learning_rate": 0.00021407726674050982, + "loss": 0.7927469, + "num_input_tokens_seen": 303054656, + "router_z_loss_mlp": 0.15661621, + "step": 3654, + "time_per_iteration": 2.917117118835449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087015, + "balance_loss_mlp": 1.07145858, + "epoch": 0.7031550596383225, + "flos": 629591989248.0, + "grad_norm": 0.05934864829913755, + "language_loss": 0.87179619, + "learning_rate": 0.0002138217449862061, + "loss": 0.88266635, + "num_input_tokens_seen": 303124256, + "router_z_loss_mlp": 0.15539551, + "step": 3655, + "time_per_iteration": 2.756298542022705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079642, + "balance_loss_mlp": 1.06409764, + "epoch": 0.703347441323586, + "flos": 530843134464.0, + "grad_norm": 0.05984331693080437, + 
"language_loss": 0.78077435, + "learning_rate": 0.00021356633433615403, + "loss": 0.79157078, + "num_input_tokens_seen": 303192720, + "router_z_loss_mlp": 0.15527344, + "step": 3656, + "time_per_iteration": 2.5978493690490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107548, + "balance_loss_mlp": 1.06014955, + "epoch": 0.7035398230088495, + "flos": 693593528832.0, + "grad_norm": 0.058151360566504745, + "language_loss": 0.83692706, + "learning_rate": 0.0002133110348895133, + "loss": 0.84768182, + "num_input_tokens_seen": 303275968, + "router_z_loss_mlp": 0.15307617, + "step": 3657, + "time_per_iteration": 2.959099769592285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077112, + "balance_loss_mlp": 1.06172252, + "epoch": 0.7037322046941131, + "flos": 968035152384.0, + "grad_norm": 0.06222478003101834, + "language_loss": 0.84750932, + "learning_rate": 0.0002130558467453999, + "loss": 0.85828042, + "num_input_tokens_seen": 303367296, + "router_z_loss_mlp": 0.15368652, + "step": 3658, + "time_per_iteration": 3.370732545852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078328, + "balance_loss_mlp": 1.0626049, + "epoch": 0.7039245863793767, + "flos": 502863427584.0, + "grad_norm": 0.06594992598251542, + "language_loss": 0.84501821, + "learning_rate": 0.0002128007700028865, + "loss": 0.85580146, + "num_input_tokens_seen": 303442768, + "router_z_loss_mlp": 0.15710449, + "step": 3659, + "time_per_iteration": 2.7245702743530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069448, + "balance_loss_mlp": 1.05422533, + "epoch": 0.7041169680646402, + "flos": 465954342912.0, + "grad_norm": 0.08946749020423889, + "language_loss": 0.84478891, + "learning_rate": 0.00021254580476100276, + "loss": 0.85548341, + "num_input_tokens_seen": 303508304, + "router_z_loss_mlp": 0.15209961, + "step": 3660, + "time_per_iteration": 2.5659265518188477 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01074702, + "balance_loss_mlp": 1.05915749, + "epoch": 0.7043093497499038, + "flos": 632181417984.0, + "grad_norm": 0.06878141726007914, + "language_loss": 0.78906703, + "learning_rate": 0.00021229095111873497, + "loss": 0.79981405, + "num_input_tokens_seen": 303579312, + "router_z_loss_mlp": 0.15527344, + "step": 3661, + "time_per_iteration": 2.76365327835083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064044, + "balance_loss_mlp": 1.04833269, + "epoch": 0.7045017314351674, + "flos": 542930190336.0, + "grad_norm": 0.07470763388511147, + "language_loss": 0.86035001, + "learning_rate": 0.0002120362091750261, + "loss": 0.87099046, + "num_input_tokens_seen": 303658384, + "router_z_loss_mlp": 0.15698242, + "step": 3662, + "time_per_iteration": 2.770721197128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072624, + "balance_loss_mlp": 1.05705488, + "epoch": 0.704694113120431, + "flos": 428237300736.0, + "grad_norm": 0.09281609686454934, + "language_loss": 0.87091279, + "learning_rate": 0.00021178157902877566, + "loss": 0.881639, + "num_input_tokens_seen": 303721136, + "router_z_loss_mlp": 0.15551758, + "step": 3663, + "time_per_iteration": 2.5207157135009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066204, + "balance_loss_mlp": 1.05053949, + "epoch": 0.7048864948056945, + "flos": 650544556032.0, + "grad_norm": 0.09122726068806429, + "language_loss": 0.86906433, + "learning_rate": 0.0002115270607788397, + "loss": 0.87972641, + "num_input_tokens_seen": 303792368, + "router_z_loss_mlp": 0.15661621, + "step": 3664, + "time_per_iteration": 2.7837629318237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066813, + "balance_loss_mlp": 1.05149484, + "epoch": 0.705078876490958, + "flos": 412562336256.0, + "grad_norm": 0.07041201506359947, + "language_loss": 0.85376197, + "learning_rate": 0.00021127265452403133, + "loss": 0.86443013, + "num_input_tokens_seen": 
303856336, + "router_z_loss_mlp": 0.15307617, + "step": 3665, + "time_per_iteration": 2.5382044315338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036955, + "balance_loss_mlp": 1.02923036, + "epoch": 0.7052712581762216, + "flos": 1420040927232.0, + "grad_norm": 0.02225212280598243, + "language_loss": 0.84091628, + "learning_rate": 0.0002110183603631199, + "loss": 0.85128582, + "num_input_tokens_seen": 304089856, + "router_z_loss_mlp": 0.07714844, + "step": 3666, + "time_per_iteration": 4.855606317520142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065613, + "balance_loss_mlp": 1.04975796, + "epoch": 0.7054636398614852, + "flos": 493049369088.0, + "grad_norm": 0.07067932110848238, + "language_loss": 0.82453668, + "learning_rate": 0.00021076417839483065, + "loss": 0.8351928, + "num_input_tokens_seen": 304164752, + "router_z_loss_mlp": 0.15844727, + "step": 3667, + "time_per_iteration": 2.784029960632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063329, + "balance_loss_mlp": 1.04703355, + "epoch": 0.7056560215467488, + "flos": 450457417728.0, + "grad_norm": 0.06070170414255382, + "language_loss": 0.84920627, + "learning_rate": 0.00021051010871784589, + "loss": 0.85983962, + "num_input_tokens_seen": 304229568, + "router_z_loss_mlp": 0.16296387, + "step": 3668, + "time_per_iteration": 2.5824825763702393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061908, + "balance_loss_mlp": 1.04632711, + "epoch": 0.7058484032320124, + "flos": 565703875584.0, + "grad_norm": 0.06560783943853528, + "language_loss": 0.7931717, + "learning_rate": 0.0002102561514308045, + "loss": 0.80379081, + "num_input_tokens_seen": 304299408, + "router_z_loss_mlp": 0.15563965, + "step": 3669, + "time_per_iteration": 2.754573345184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064666, + "balance_loss_mlp": 1.04950261, + "epoch": 0.7060407849172758, + "flos": 567008501760.0, + "grad_norm": 
0.0735599697631235, + "language_loss": 0.82317781, + "learning_rate": 0.00021000230663230135, + "loss": 0.83382452, + "num_input_tokens_seen": 304367936, + "router_z_loss_mlp": 0.15148926, + "step": 3670, + "time_per_iteration": 2.7335312366485596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057554, + "balance_loss_mlp": 1.04233122, + "epoch": 0.7062331666025394, + "flos": 468746403840.0, + "grad_norm": 0.08649275144444013, + "language_loss": 0.82978272, + "learning_rate": 0.00020974857442088762, + "loss": 0.84035832, + "num_input_tokens_seen": 304438368, + "router_z_loss_mlp": 0.15197754, + "step": 3671, + "time_per_iteration": 2.5915567874908447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062839, + "balance_loss_mlp": 1.04766369, + "epoch": 0.706425548287803, + "flos": 595316749824.0, + "grad_norm": 0.13981263765851287, + "language_loss": 0.88996911, + "learning_rate": 0.00020949495489507104, + "loss": 0.90059757, + "num_input_tokens_seen": 304508720, + "router_z_loss_mlp": 0.15148926, + "step": 3672, + "time_per_iteration": 2.679868459701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067551, + "balance_loss_mlp": 1.05241179, + "epoch": 0.7066179299730666, + "flos": 475815389184.0, + "grad_norm": 0.08084311033907006, + "language_loss": 0.84611428, + "learning_rate": 0.00020924144815331525, + "loss": 0.85678977, + "num_input_tokens_seen": 304576128, + "router_z_loss_mlp": 0.15124512, + "step": 3673, + "time_per_iteration": 2.5828816890716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066376, + "balance_loss_mlp": 1.05085516, + "epoch": 0.7068103116583301, + "flos": 506409117696.0, + "grad_norm": 0.07749570003659609, + "language_loss": 0.83121467, + "learning_rate": 0.00020898805429404044, + "loss": 0.84187841, + "num_input_tokens_seen": 304642416, + "router_z_loss_mlp": 0.1550293, + "step": 3674, + "time_per_iteration": 2.6209206581115723 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01067061, + "balance_loss_mlp": 1.05180216, + "epoch": 0.7070026933435937, + "flos": 679336989696.0, + "grad_norm": 0.08324875322502746, + "language_loss": 0.78500605, + "learning_rate": 0.0002087347734156228, + "loss": 0.79567671, + "num_input_tokens_seen": 304719312, + "router_z_loss_mlp": 0.15234375, + "step": 3675, + "time_per_iteration": 2.8822014331817627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065667, + "balance_loss_mlp": 1.05075419, + "epoch": 0.7071950750288573, + "flos": 472217942016.0, + "grad_norm": 0.07260496319451265, + "language_loss": 0.79725403, + "learning_rate": 0.00020848160561639452, + "loss": 0.80791068, + "num_input_tokens_seen": 304789296, + "router_z_loss_mlp": 0.14904785, + "step": 3676, + "time_per_iteration": 2.6594197750091553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067782, + "balance_loss_mlp": 1.05267811, + "epoch": 0.7073874567141208, + "flos": 473742452736.0, + "grad_norm": 0.07068166110728066, + "language_loss": 0.86114025, + "learning_rate": 0.0002082285509946445, + "loss": 0.87181818, + "num_input_tokens_seen": 304854320, + "router_z_loss_mlp": 0.15087891, + "step": 3677, + "time_per_iteration": 2.607058525085449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071327, + "balance_loss_mlp": 1.05596066, + "epoch": 0.7075798383993844, + "flos": 545877895680.0, + "grad_norm": 0.07275047066851685, + "language_loss": 0.83149093, + "learning_rate": 0.00020797560964861683, + "loss": 0.84220415, + "num_input_tokens_seen": 304932784, + "router_z_loss_mlp": 0.15344238, + "step": 3678, + "time_per_iteration": 2.7852048873901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107009, + "balance_loss_mlp": 1.05486727, + "epoch": 0.7077722200846479, + "flos": 662090526720.0, + "grad_norm": 0.18523634613836037, + "language_loss": 0.80623943, + "learning_rate": 0.0002077227816765122, + "loss": 0.81694031, + 
"num_input_tokens_seen": 305018080, + "router_z_loss_mlp": 0.15197754, + "step": 3679, + "time_per_iteration": 3.0609920024871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020725, + "balance_loss_mlp": 1.01333392, + "epoch": 0.7079646017699115, + "flos": 1529960223744.0, + "grad_norm": 0.017379131221616127, + "language_loss": 0.76447725, + "learning_rate": 0.0002074700671764869, + "loss": 0.77468443, + "num_input_tokens_seen": 305241216, + "router_z_loss_mlp": 0.07373047, + "step": 3680, + "time_per_iteration": 4.8490447998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062109, + "balance_loss_mlp": 1.04688621, + "epoch": 0.7081569834551751, + "flos": 621502502400.0, + "grad_norm": 0.06739909135454819, + "language_loss": 0.78692472, + "learning_rate": 0.00020721746624665383, + "loss": 0.79754579, + "num_input_tokens_seen": 305311376, + "router_z_loss_mlp": 0.15197754, + "step": 3681, + "time_per_iteration": 2.73974609375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106588, + "balance_loss_mlp": 1.05075181, + "epoch": 0.7083493651404387, + "flos": 794630435328.0, + "grad_norm": 0.061911135339539125, + "language_loss": 0.80153, + "learning_rate": 0.00020696497898508114, + "loss": 0.8121888, + "num_input_tokens_seen": 305392736, + "router_z_loss_mlp": 0.15100098, + "step": 3682, + "time_per_iteration": 3.0617854595184326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066388, + "balance_loss_mlp": 1.050915, + "epoch": 0.7085417468257021, + "flos": 813747202560.0, + "grad_norm": 0.0963650064314711, + "language_loss": 0.77652842, + "learning_rate": 0.00020671260548979316, + "loss": 0.78719234, + "num_input_tokens_seen": 305470896, + "router_z_loss_mlp": 0.15454102, + "step": 3683, + "time_per_iteration": 3.0387070178985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070402, + "balance_loss_mlp": 1.05497599, + "epoch": 0.7087341285109657, + "flos": 
700566340608.0, + "grad_norm": 0.07537323093447403, + "language_loss": 0.85192174, + "learning_rate": 0.00020646034585876982, + "loss": 0.86262578, + "num_input_tokens_seen": 305547072, + "router_z_loss_mlp": 0.1541748, + "step": 3684, + "time_per_iteration": 2.8481369018554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067395, + "balance_loss_mlp": 1.05154002, + "epoch": 0.7089265101962293, + "flos": 596514917376.0, + "grad_norm": 0.07238379528702499, + "language_loss": 0.84238535, + "learning_rate": 0.00020620820018994718, + "loss": 0.85305929, + "num_input_tokens_seen": 305624512, + "router_z_loss_mlp": 0.15844727, + "step": 3685, + "time_per_iteration": 2.850832462310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070547, + "balance_loss_mlp": 1.05496585, + "epoch": 0.7091188918814929, + "flos": 487106970624.0, + "grad_norm": 0.08929254247711407, + "language_loss": 0.82788908, + "learning_rate": 0.00020595616858121675, + "loss": 0.83859456, + "num_input_tokens_seen": 305695088, + "router_z_loss_mlp": 0.15563965, + "step": 3686, + "time_per_iteration": 2.6979212760925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069674, + "balance_loss_mlp": 1.05395079, + "epoch": 0.7093112735667565, + "flos": 600117507072.0, + "grad_norm": 0.06683720470711539, + "language_loss": 0.80962205, + "learning_rate": 0.00020570425113042586, + "loss": 0.82031882, + "num_input_tokens_seen": 305763680, + "router_z_loss_mlp": 0.15710449, + "step": 3687, + "time_per_iteration": 2.781977415084839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073663, + "balance_loss_mlp": 1.05815399, + "epoch": 0.70950365525202, + "flos": 505830956544.0, + "grad_norm": 0.08176203647633842, + "language_loss": 0.85817683, + "learning_rate": 0.0002054524479353776, + "loss": 0.86891353, + "num_input_tokens_seen": 305835008, + "router_z_loss_mlp": 0.1550293, + "step": 3688, + "time_per_iteration": 2.7264649868011475 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107422, + "balance_loss_mlp": 1.05829346, + "epoch": 0.7096960369372836, + "flos": 732160747008.0, + "grad_norm": 0.07836614397127288, + "language_loss": 0.81732869, + "learning_rate": 0.00020520075909383063, + "loss": 0.82807088, + "num_input_tokens_seen": 305909072, + "router_z_loss_mlp": 0.15917969, + "step": 3689, + "time_per_iteration": 2.8794634342193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074673, + "balance_loss_mlp": 1.05887747, + "epoch": 0.7098884186225471, + "flos": 972077511168.0, + "grad_norm": 0.06452769831785021, + "language_loss": 0.80728209, + "learning_rate": 0.00020494918470349916, + "loss": 0.81802881, + "num_input_tokens_seen": 305994752, + "router_z_loss_mlp": 0.15783691, + "step": 3690, + "time_per_iteration": 3.310556173324585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073435, + "balance_loss_mlp": 1.05783021, + "epoch": 0.7100808003078107, + "flos": 504252117504.0, + "grad_norm": 0.07986210521804603, + "language_loss": 0.85468179, + "learning_rate": 0.00020469772486205297, + "loss": 0.86541611, + "num_input_tokens_seen": 306062960, + "router_z_loss_mlp": 0.15588379, + "step": 3691, + "time_per_iteration": 2.6372458934783936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073806, + "balance_loss_mlp": 1.05890524, + "epoch": 0.7102731819930742, + "flos": 540335992320.0, + "grad_norm": 0.0774052521314589, + "language_loss": 0.80950189, + "learning_rate": 0.0002044463796671177, + "loss": 0.82023996, + "num_input_tokens_seen": 306134224, + "router_z_loss_mlp": 0.14880371, + "step": 3692, + "time_per_iteration": 2.7093636989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076355, + "balance_loss_mlp": 1.06070268, + "epoch": 0.7104655636783378, + "flos": 620378113536.0, + "grad_norm": 0.09666696589873951, + "language_loss": 0.80422229, + "learning_rate": 0.00020419514921627408, + "loss": 
0.81498581, + "num_input_tokens_seen": 306214512, + "router_z_loss_mlp": 0.15649414, + "step": 3693, + "time_per_iteration": 2.8826653957366943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076233, + "balance_loss_mlp": 1.06092691, + "epoch": 0.7106579453636014, + "flos": 557322923520.0, + "grad_norm": 0.09593640635946206, + "language_loss": 0.77400964, + "learning_rate": 0.00020394403360705855, + "loss": 0.78477204, + "num_input_tokens_seen": 306283232, + "router_z_loss_mlp": 0.15283203, + "step": 3694, + "time_per_iteration": 2.711338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073674, + "balance_loss_mlp": 1.05866575, + "epoch": 0.710850327048865, + "flos": 513048245760.0, + "grad_norm": 0.07513000367190234, + "language_loss": 0.87831378, + "learning_rate": 0.00020369303293696228, + "loss": 0.88905054, + "num_input_tokens_seen": 306351536, + "router_z_loss_mlp": 0.15002441, + "step": 3695, + "time_per_iteration": 2.6499857902526855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076787, + "balance_loss_mlp": 1.06118226, + "epoch": 0.7110427087341286, + "flos": 423619352064.0, + "grad_norm": 0.09128032628418083, + "language_loss": 0.78371423, + "learning_rate": 0.00020344214730343304, + "loss": 0.79448211, + "num_input_tokens_seen": 306419040, + "router_z_loss_mlp": 0.15588379, + "step": 3696, + "time_per_iteration": 2.6158998012542725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066913, + "balance_loss_mlp": 1.05185723, + "epoch": 0.711235090419392, + "flos": 577415402496.0, + "grad_norm": 0.06490931607854103, + "language_loss": 0.79312873, + "learning_rate": 0.00020319137680387296, + "loss": 0.80379784, + "num_input_tokens_seen": 306503248, + "router_z_loss_mlp": 0.15039062, + "step": 3697, + "time_per_iteration": 2.949493646621704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068106, + "balance_loss_mlp": 1.05243063, + "epoch": 0.7114274721046556, 
+ "flos": 448060709376.0, + "grad_norm": 0.07559912966503037, + "language_loss": 0.80551994, + "learning_rate": 0.0002029407215356398, + "loss": 0.81620097, + "num_input_tokens_seen": 306566288, + "router_z_loss_mlp": 0.15673828, + "step": 3698, + "time_per_iteration": 2.5261340141296387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070696, + "balance_loss_mlp": 1.0556761, + "epoch": 0.7116198537899192, + "flos": 621962095104.0, + "grad_norm": 0.07643567713665894, + "language_loss": 0.83342177, + "learning_rate": 0.00020269018159604663, + "loss": 0.84412873, + "num_input_tokens_seen": 306633344, + "router_z_loss_mlp": 0.15002441, + "step": 3699, + "time_per_iteration": 2.7795286178588867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069252, + "balance_loss_mlp": 1.05404091, + "epoch": 0.7118122354751828, + "flos": 498724895232.0, + "grad_norm": 0.06553173563730097, + "language_loss": 0.82171476, + "learning_rate": 0.00020243975708236162, + "loss": 0.83240736, + "num_input_tokens_seen": 306701328, + "router_z_loss_mlp": 0.15197754, + "step": 3700, + "time_per_iteration": 2.603534698486328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066013, + "balance_loss_mlp": 1.05129027, + "epoch": 0.7120046171604463, + "flos": 572718532608.0, + "grad_norm": 0.07521702556055786, + "language_loss": 0.86229205, + "learning_rate": 0.00020218944809180818, + "loss": 0.87295222, + "num_input_tokens_seen": 306773168, + "router_z_loss_mlp": 0.14709473, + "step": 3701, + "time_per_iteration": 2.7194745540618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072931, + "balance_loss_mlp": 1.05825663, + "epoch": 0.7121969988457099, + "flos": 572664204288.0, + "grad_norm": 0.06709763936599906, + "language_loss": 0.84454715, + "learning_rate": 0.00020193925472156493, + "loss": 0.85527647, + "num_input_tokens_seen": 306845312, + "router_z_loss_mlp": 0.14648438, + "step": 3702, + "time_per_iteration": 
2.6942999362945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008682, + "balance_loss_mlp": 1.00162458, + "epoch": 0.7123893805309734, + "flos": 1523429752320.0, + "grad_norm": 0.007804959242713824, + "language_loss": 0.74289167, + "learning_rate": 0.00020168917706876537, + "loss": 0.75297856, + "num_input_tokens_seen": 307079216, + "router_z_loss_mlp": 0.07080078, + "step": 3703, + "time_per_iteration": 4.9340033531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073126, + "balance_loss_mlp": 1.05801082, + "epoch": 0.712581762216237, + "flos": 615105280512.0, + "grad_norm": 0.05932039995937275, + "language_loss": 0.83487558, + "learning_rate": 0.00020143921523049863, + "loss": 0.8456068, + "num_input_tokens_seen": 307163568, + "router_z_loss_mlp": 0.15087891, + "step": 3704, + "time_per_iteration": 2.9551632404327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069307, + "balance_loss_mlp": 1.05464458, + "epoch": 0.7127741439015006, + "flos": 597777698304.0, + "grad_norm": 0.08724240459453055, + "language_loss": 0.84004354, + "learning_rate": 0.00020118936930380837, + "loss": 0.85073662, + "num_input_tokens_seen": 307232800, + "router_z_loss_mlp": 0.14648438, + "step": 3705, + "time_per_iteration": 2.7111411094665527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076471, + "balance_loss_mlp": 1.06118834, + "epoch": 0.7129665255867641, + "flos": 537398198784.0, + "grad_norm": 0.07870920964767068, + "language_loss": 0.81005669, + "learning_rate": 0.0002009396393856932, + "loss": 0.8208214, + "num_input_tokens_seen": 307307216, + "router_z_loss_mlp": 0.15258789, + "step": 3706, + "time_per_iteration": 2.6393184661865234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066656, + "balance_loss_mlp": 1.05216026, + "epoch": 0.7131589072720277, + "flos": 526442499072.0, + "grad_norm": 0.07318964523896145, + "language_loss": 0.82600415, + "learning_rate": 
0.00020069002557310673, + "loss": 0.83667076, + "num_input_tokens_seen": 307377472, + "router_z_loss_mlp": 0.14489746, + "step": 3707, + "time_per_iteration": 2.6772639751434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072653, + "balance_loss_mlp": 1.05814552, + "epoch": 0.7133512889572913, + "flos": 530919484416.0, + "grad_norm": 0.07177417053669936, + "language_loss": 0.76892489, + "learning_rate": 0.00020044052796295807, + "loss": 0.77965146, + "num_input_tokens_seen": 307456880, + "router_z_loss_mlp": 0.14501953, + "step": 3708, + "time_per_iteration": 2.8213729858398438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068946, + "balance_loss_mlp": 1.05386651, + "epoch": 0.7135436706425549, + "flos": 503535564288.0, + "grad_norm": 0.08204040588858591, + "language_loss": 0.81975353, + "learning_rate": 0.00020019114665211063, + "loss": 0.83044302, + "num_input_tokens_seen": 307524784, + "router_z_loss_mlp": 0.1505127, + "step": 3709, + "time_per_iteration": 2.622581958770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069621, + "balance_loss_mlp": 1.0548985, + "epoch": 0.7137360523278183, + "flos": 515968786944.0, + "grad_norm": 0.1809650911769107, + "language_loss": 0.81334156, + "learning_rate": 0.00019994188173738276, + "loss": 0.82403779, + "num_input_tokens_seen": 307591408, + "router_z_loss_mlp": 0.14697266, + "step": 3710, + "time_per_iteration": 2.6591386795043945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070639, + "balance_loss_mlp": 1.05586886, + "epoch": 0.7139284340130819, + "flos": 510389434368.0, + "grad_norm": 0.07384437980034154, + "language_loss": 0.80407298, + "learning_rate": 0.0001996927333155477, + "loss": 0.81477934, + "num_input_tokens_seen": 307662912, + "router_z_loss_mlp": 0.14758301, + "step": 3711, + "time_per_iteration": 2.8079118728637695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075523, + "balance_loss_mlp": 
1.06068099, + "epoch": 0.7141208156983455, + "flos": 890275940352.0, + "grad_norm": 0.06892114468166954, + "language_loss": 0.85343927, + "learning_rate": 0.00019944370148333346, + "loss": 0.86419451, + "num_input_tokens_seen": 307752256, + "router_z_loss_mlp": 0.14819336, + "step": 3712, + "time_per_iteration": 3.1857290267944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072063, + "balance_loss_mlp": 1.0572927, + "epoch": 0.7143131973836091, + "flos": 535779712512.0, + "grad_norm": 0.07489369079916172, + "language_loss": 0.79687518, + "learning_rate": 0.00019919478633742278, + "loss": 0.80759573, + "num_input_tokens_seen": 307821504, + "router_z_loss_mlp": 0.14758301, + "step": 3713, + "time_per_iteration": 2.705082416534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077145, + "balance_loss_mlp": 1.06194544, + "epoch": 0.7145055790688727, + "flos": 473668300800.0, + "grad_norm": 0.08783705919644806, + "language_loss": 0.85156208, + "learning_rate": 0.00019894598797445302, + "loss": 0.86233354, + "num_input_tokens_seen": 307886464, + "router_z_loss_mlp": 0.15185547, + "step": 3714, + "time_per_iteration": 2.5540032386779785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072516, + "balance_loss_mlp": 1.05732846, + "epoch": 0.7146979607541362, + "flos": 570521885184.0, + "grad_norm": 0.06443194669340387, + "language_loss": 0.81776547, + "learning_rate": 0.00019869730649101615, + "loss": 0.82849067, + "num_input_tokens_seen": 307962736, + "router_z_loss_mlp": 0.15161133, + "step": 3715, + "time_per_iteration": 2.811156988143921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074394, + "balance_loss_mlp": 1.05909991, + "epoch": 0.7148903424393998, + "flos": 839666082816.0, + "grad_norm": 0.07982240605965638, + "language_loss": 0.72529298, + "learning_rate": 0.00019844874198365943, + "loss": 0.7360369, + "num_input_tokens_seen": 308046592, + "router_z_loss_mlp": 0.15283203, + "step": 
3716, + "time_per_iteration": 3.1387200355529785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076276, + "balance_loss_mlp": 1.06136334, + "epoch": 0.7150827241246633, + "flos": 541823427072.0, + "grad_norm": 0.09017082219564719, + "language_loss": 0.83709008, + "learning_rate": 0.00019820029454888362, + "loss": 0.84785283, + "num_input_tokens_seen": 308119920, + "router_z_loss_mlp": 0.14892578, + "step": 3717, + "time_per_iteration": 2.7234127521514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022761, + "balance_loss_mlp": 1.01575112, + "epoch": 0.7152751058099269, + "flos": 1583678200320.0, + "grad_norm": 0.012614936180071102, + "language_loss": 0.74521267, + "learning_rate": 0.00019795196428314455, + "loss": 0.75544029, + "num_input_tokens_seen": 308361024, + "router_z_loss_mlp": 0.0703125, + "step": 3718, + "time_per_iteration": 5.082587957382202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079514, + "balance_loss_mlp": 1.06449401, + "epoch": 0.7154674874951905, + "flos": 517419145728.0, + "grad_norm": 0.07146981792263798, + "language_loss": 0.80162418, + "learning_rate": 0.0001977037512828529, + "loss": 0.8124193, + "num_input_tokens_seen": 308429808, + "router_z_loss_mlp": 0.15002441, + "step": 3719, + "time_per_iteration": 2.6214728355407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071959, + "balance_loss_mlp": 1.05687928, + "epoch": 0.715659869180454, + "flos": 602524127232.0, + "grad_norm": 0.0719921548875284, + "language_loss": 0.86400878, + "learning_rate": 0.0001974556556443734, + "loss": 0.87472844, + "num_input_tokens_seen": 308501888, + "router_z_loss_mlp": 0.15063477, + "step": 3720, + "time_per_iteration": 2.7185661792755127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071639, + "balance_loss_mlp": 1.05692816, + "epoch": 0.7158522508657176, + "flos": 531675684864.0, + "grad_norm": 0.10794401503038722, + "language_loss": 0.88869536, + 
"learning_rate": 0.00019720767746402547, + "loss": 0.89941168, + "num_input_tokens_seen": 308576368, + "router_z_loss_mlp": 0.14685059, + "step": 3721, + "time_per_iteration": 2.7171661853790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077737, + "balance_loss_mlp": 1.06312251, + "epoch": 0.7160446325509812, + "flos": 557569972224.0, + "grad_norm": 0.06715510904090914, + "language_loss": 0.7994473, + "learning_rate": 0.00019695981683808222, + "loss": 0.81022465, + "num_input_tokens_seen": 308651936, + "router_z_loss_mlp": 0.14599609, + "step": 3722, + "time_per_iteration": 2.764094114303589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076539, + "balance_loss_mlp": 1.06211424, + "epoch": 0.7162370142362448, + "flos": 690986847744.0, + "grad_norm": 0.0719125731951098, + "language_loss": 0.84857029, + "learning_rate": 0.00019671207386277225, + "loss": 0.85933566, + "num_input_tokens_seen": 308737264, + "router_z_loss_mlp": 0.14404297, + "step": 3723, + "time_per_iteration": 3.001659870147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079748, + "balance_loss_mlp": 1.06515729, + "epoch": 0.7164293959215082, + "flos": 794109173760.0, + "grad_norm": 0.06669181204662188, + "language_loss": 0.78279907, + "learning_rate": 0.0001964644486342777, + "loss": 0.79359657, + "num_input_tokens_seen": 308811776, + "router_z_loss_mlp": 0.14575195, + "step": 3724, + "time_per_iteration": 2.9778544902801514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081993, + "balance_loss_mlp": 1.06744969, + "epoch": 0.7166217776067718, + "flos": 494178527232.0, + "grad_norm": 0.0857275082292459, + "language_loss": 0.862409, + "learning_rate": 0.00019621694124873524, + "loss": 0.87322897, + "num_input_tokens_seen": 308886704, + "router_z_loss_mlp": 0.14526367, + "step": 3725, + "time_per_iteration": 2.704180955886841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019771, + 
"balance_loss_mlp": 1.0125227, + "epoch": 0.7168141592920354, + "flos": 1401060354048.0, + "grad_norm": 0.010100997712727341, + "language_loss": 0.76540077, + "learning_rate": 0.00019596955180223557, + "loss": 0.77559853, + "num_input_tokens_seen": 309113456, + "router_z_loss_mlp": 0.07226562, + "step": 3726, + "time_per_iteration": 4.889356374740601 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081675, + "balance_loss_mlp": 1.06740522, + "epoch": 0.717006540977299, + "flos": 793150341120.0, + "grad_norm": 0.06067860958569485, + "language_loss": 0.77179575, + "learning_rate": 0.00019572228039082428, + "loss": 0.7826125, + "num_input_tokens_seen": 309198768, + "router_z_loss_mlp": 0.14257812, + "step": 3727, + "time_per_iteration": 3.0806260108947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086686, + "balance_loss_mlp": 1.07246482, + "epoch": 0.7171989226625626, + "flos": 554812416000.0, + "grad_norm": 0.11517752889227628, + "language_loss": 0.83454174, + "learning_rate": 0.0001954751271105002, + "loss": 0.84540862, + "num_input_tokens_seen": 309279680, + "router_z_loss_mlp": 0.14221191, + "step": 3728, + "time_per_iteration": 2.8201711177825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090041, + "balance_loss_mlp": 1.07510376, + "epoch": 0.717391304347826, + "flos": 555914409984.0, + "grad_norm": 0.0783907674353494, + "language_loss": 0.80835211, + "learning_rate": 0.00019522809205721687, + "loss": 0.81925255, + "num_input_tokens_seen": 309359152, + "router_z_loss_mlp": 0.14904785, + "step": 3729, + "time_per_iteration": 2.7735860347747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086368, + "balance_loss_mlp": 1.07193196, + "epoch": 0.7175836860330896, + "flos": 538855898112.0, + "grad_norm": 0.0782422692062248, + "language_loss": 0.82922757, + "learning_rate": 0.0001949811753268816, + "loss": 0.84009123, + "num_input_tokens_seen": 309432800, + "router_z_loss_mlp": 
0.14428711, + "step": 3730, + "time_per_iteration": 2.7340402603149414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085427, + "balance_loss_mlp": 1.07047808, + "epoch": 0.7177760677183532, + "flos": 515637674496.0, + "grad_norm": 0.07822041527126099, + "language_loss": 0.82415104, + "learning_rate": 0.00019473437701535634, + "loss": 0.83500528, + "num_input_tokens_seen": 309499456, + "router_z_loss_mlp": 0.14929199, + "step": 3731, + "time_per_iteration": 2.6087753772735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077018, + "balance_loss_mlp": 1.06260514, + "epoch": 0.7179684494036168, + "flos": 674719041024.0, + "grad_norm": 0.09315520299322393, + "language_loss": 0.89131868, + "learning_rate": 0.00019448769721845677, + "loss": 0.90208888, + "num_input_tokens_seen": 309571056, + "router_z_loss_mlp": 0.1439209, + "step": 3732, + "time_per_iteration": 2.836510419845581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077686, + "balance_loss_mlp": 1.06293976, + "epoch": 0.7181608310888803, + "flos": 469912637952.0, + "grad_norm": 0.09025148051517691, + "language_loss": 0.85745353, + "learning_rate": 0.00019424113603195203, + "loss": 0.86823046, + "num_input_tokens_seen": 309635040, + "router_z_loss_mlp": 0.1472168, + "step": 3733, + "time_per_iteration": 2.562520742416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079742, + "balance_loss_mlp": 1.06485271, + "epoch": 0.7183532127741439, + "flos": 593952652800.0, + "grad_norm": 0.07835269792198636, + "language_loss": 0.80024004, + "learning_rate": 0.0001939946935515657, + "loss": 0.81103742, + "num_input_tokens_seen": 309713696, + "router_z_loss_mlp": 0.14868164, + "step": 3734, + "time_per_iteration": 2.836775302886963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107287, + "balance_loss_mlp": 1.05774188, + "epoch": 0.7185455944594075, + "flos": 498917615616.0, + "grad_norm": 0.12420836308345841, + 
"language_loss": 0.80785656, + "learning_rate": 0.0001937483698729755, + "loss": 0.81858528, + "num_input_tokens_seen": 309782864, + "router_z_loss_mlp": 0.15100098, + "step": 3735, + "time_per_iteration": 2.63600492477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071914, + "balance_loss_mlp": 1.05714417, + "epoch": 0.718737976144671, + "flos": 814933260288.0, + "grad_norm": 0.06842150185792192, + "language_loss": 0.82507128, + "learning_rate": 0.0001935021650918128, + "loss": 0.8357904, + "num_input_tokens_seen": 309867056, + "router_z_loss_mlp": 0.14758301, + "step": 3736, + "time_per_iteration": 3.00943922996521 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068199, + "balance_loss_mlp": 1.0535481, + "epoch": 0.7189303578299346, + "flos": 438328143360.0, + "grad_norm": 0.07910633337871513, + "language_loss": 0.86689806, + "learning_rate": 0.0001932560793036625, + "loss": 0.87758005, + "num_input_tokens_seen": 309929744, + "router_z_loss_mlp": 0.14624023, + "step": 3737, + "time_per_iteration": 2.5100209712982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071415, + "balance_loss_mlp": 1.05637121, + "epoch": 0.7191227395151981, + "flos": 549398992896.0, + "grad_norm": 0.07360308333676036, + "language_loss": 0.86295319, + "learning_rate": 0.00019301011260406382, + "loss": 0.87366736, + "num_input_tokens_seen": 309998128, + "router_z_loss_mlp": 0.15014648, + "step": 3738, + "time_per_iteration": 2.6612305641174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066115, + "balance_loss_mlp": 1.05066597, + "epoch": 0.7193151212004617, + "flos": 626938320384.0, + "grad_norm": 0.06504656569076563, + "language_loss": 0.79763281, + "learning_rate": 0.00019276426508850936, + "loss": 0.80829394, + "num_input_tokens_seen": 310065472, + "router_z_loss_mlp": 0.15429688, + "step": 3739, + "time_per_iteration": 2.7507288455963135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01068994, + "balance_loss_mlp": 1.05356801, + "epoch": 0.7195075028857253, + "flos": 741062960640.0, + "grad_norm": 0.14081168877709307, + "language_loss": 0.80209506, + "learning_rate": 0.00019251853685244564, + "loss": 0.81278491, + "num_input_tokens_seen": 310152960, + "router_z_loss_mlp": 0.15405273, + "step": 3740, + "time_per_iteration": 3.0117878913879395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066123, + "balance_loss_mlp": 1.05085278, + "epoch": 0.7196998845709889, + "flos": 802875566592.0, + "grad_norm": 0.09880671887971038, + "language_loss": 0.80556595, + "learning_rate": 0.00019227292799127283, + "loss": 0.8162272, + "num_input_tokens_seen": 310234080, + "router_z_loss_mlp": 0.15258789, + "step": 3741, + "time_per_iteration": 3.026409864425659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064138, + "balance_loss_mlp": 1.04933214, + "epoch": 0.7198922662562524, + "flos": 925183669248.0, + "grad_norm": 0.07716038295803591, + "language_loss": 0.79115927, + "learning_rate": 0.00019202743860034454, + "loss": 0.80180067, + "num_input_tokens_seen": 310330208, + "router_z_loss_mlp": 0.14770508, + "step": 3742, + "time_per_iteration": 3.2409439086914062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062198, + "balance_loss_mlp": 1.04710603, + "epoch": 0.7200846479415159, + "flos": 580111289856.0, + "grad_norm": 0.0865048699099666, + "language_loss": 0.8386541, + "learning_rate": 0.00019178206877496873, + "loss": 0.84927607, + "num_input_tokens_seen": 310402960, + "router_z_loss_mlp": 0.15075684, + "step": 3743, + "time_per_iteration": 2.7031853199005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065388, + "balance_loss_mlp": 1.05033231, + "epoch": 0.7202770296267795, + "flos": 557695881216.0, + "grad_norm": 0.06660391987267253, + "language_loss": 0.85197371, + "learning_rate": 0.0001915368186104059, + "loss": 0.86262763, + "num_input_tokens_seen": 310479776, + 
"router_z_loss_mlp": 0.15026855, + "step": 3744, + "time_per_iteration": 2.80344557762146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067637, + "balance_loss_mlp": 1.05193746, + "epoch": 0.7204694113120431, + "flos": 672552129024.0, + "grad_norm": 0.07605590282722621, + "language_loss": 0.8109616, + "learning_rate": 0.0001912916882018706, + "loss": 0.82163799, + "num_input_tokens_seen": 310555952, + "router_z_loss_mlp": 0.15698242, + "step": 3745, + "time_per_iteration": 2.8043081760406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073912, + "balance_loss_mlp": 1.05809283, + "epoch": 0.7206617929973067, + "flos": 799194055680.0, + "grad_norm": 0.09426618368019588, + "language_loss": 0.79127324, + "learning_rate": 0.00019104667764453125, + "loss": 0.80201232, + "num_input_tokens_seen": 310634784, + "router_z_loss_mlp": 0.15808105, + "step": 3746, + "time_per_iteration": 3.0704562664031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067683, + "balance_loss_mlp": 1.0524838, + "epoch": 0.7208541746825702, + "flos": 531898140672.0, + "grad_norm": 0.06643820747478134, + "language_loss": 0.8021549, + "learning_rate": 0.00019080178703350926, + "loss": 0.81283176, + "num_input_tokens_seen": 310703216, + "router_z_loss_mlp": 0.1517334, + "step": 3747, + "time_per_iteration": 2.68495774269104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068891, + "balance_loss_mlp": 1.05280995, + "epoch": 0.7210465563678338, + "flos": 535139882496.0, + "grad_norm": 0.0742282179981503, + "language_loss": 0.8279261, + "learning_rate": 0.00019055701646387952, + "loss": 0.838615, + "num_input_tokens_seen": 310776816, + "router_z_loss_mlp": 0.1607666, + "step": 3748, + "time_per_iteration": 2.6640145778656006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028096, + "balance_loss_mlp": 1.02113438, + "epoch": 0.7212389380530974, + "flos": 1533908606976.0, + "grad_norm": 
0.025188249902834022, + "language_loss": 0.80472684, + "learning_rate": 0.00019031236603067042, + "loss": 0.81500781, + "num_input_tokens_seen": 310987056, + "router_z_loss_mlp": 0.06982422, + "step": 3749, + "time_per_iteration": 4.826270341873169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066097, + "balance_loss_mlp": 1.05093408, + "epoch": 0.7214313197383609, + "flos": 461511862272.0, + "grad_norm": 0.08049000033963269, + "language_loss": 0.86480904, + "learning_rate": 0.00019006783582886368, + "loss": 0.87546998, + "num_input_tokens_seen": 311051648, + "router_z_loss_mlp": 0.15136719, + "step": 3750, + "time_per_iteration": 2.621736526489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067451, + "balance_loss_mlp": 1.05215693, + "epoch": 0.7216237014236244, + "flos": 1037134056960.0, + "grad_norm": 0.08524695909851505, + "language_loss": 0.82916629, + "learning_rate": 0.00018982342595339437, + "loss": 0.83984083, + "num_input_tokens_seen": 311146272, + "router_z_loss_mlp": 0.15270996, + "step": 3751, + "time_per_iteration": 3.483065128326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070357, + "balance_loss_mlp": 1.05556357, + "epoch": 0.721816083108888, + "flos": 895951466496.0, + "grad_norm": 0.06727789695466473, + "language_loss": 0.82144976, + "learning_rate": 0.00018957913649915076, + "loss": 0.83215332, + "num_input_tokens_seen": 311223760, + "router_z_loss_mlp": 0.14770508, + "step": 3752, + "time_per_iteration": 3.1644365787506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072387, + "balance_loss_mlp": 1.05725896, + "epoch": 0.7220084647941516, + "flos": 523314556416.0, + "grad_norm": 0.07729245448911205, + "language_loss": 0.79620636, + "learning_rate": 0.00018933496756097428, + "loss": 0.80693024, + "num_input_tokens_seen": 311290336, + "router_z_loss_mlp": 0.15100098, + "step": 3753, + "time_per_iteration": 2.620807409286499 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01072735, + "balance_loss_mlp": 1.05732155, + "epoch": 0.7222008464794152, + "flos": 816099494400.0, + "grad_norm": 0.5805538149813421, + "language_loss": 0.81562132, + "learning_rate": 0.0001890909192336603, + "loss": 0.8263486, + "num_input_tokens_seen": 311366240, + "router_z_loss_mlp": 0.15393066, + "step": 3754, + "time_per_iteration": 3.042017936706543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078212, + "balance_loss_mlp": 1.06300032, + "epoch": 0.7223932281646788, + "flos": 749053702656.0, + "grad_norm": 0.0713648645371922, + "language_loss": 0.70115459, + "learning_rate": 0.00018884699161195623, + "loss": 0.71193671, + "num_input_tokens_seen": 311445184, + "router_z_loss_mlp": 0.15185547, + "step": 3755, + "time_per_iteration": 2.9720511436462402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076125, + "balance_loss_mlp": 1.06104493, + "epoch": 0.7225856098499422, + "flos": 745502870016.0, + "grad_norm": 0.09493040514567173, + "language_loss": 0.77216029, + "learning_rate": 0.00018860318479056327, + "loss": 0.78292155, + "num_input_tokens_seen": 311527280, + "router_z_loss_mlp": 0.15075684, + "step": 3756, + "time_per_iteration": 3.119727373123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083813, + "balance_loss_mlp": 1.06946039, + "epoch": 0.7227779915352058, + "flos": 547330825728.0, + "grad_norm": 0.0825815003753041, + "language_loss": 0.83252132, + "learning_rate": 0.00018835949886413555, + "loss": 0.84335947, + "num_input_tokens_seen": 311601552, + "router_z_loss_mlp": 0.14343262, + "step": 3757, + "time_per_iteration": 2.767611026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080172, + "balance_loss_mlp": 1.06541348, + "epoch": 0.7229703732204694, + "flos": 530484857856.0, + "grad_norm": 0.07604080274562658, + "language_loss": 0.7847476, + "learning_rate": 0.0001881159339272806, + "loss": 0.79554933, + 
"num_input_tokens_seen": 311670736, + "router_z_loss_mlp": 0.14733887, + "step": 3758, + "time_per_iteration": 2.644622325897217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086486, + "balance_loss_mlp": 1.07175171, + "epoch": 0.723162754905733, + "flos": 528355021824.0, + "grad_norm": 0.07134654052906102, + "language_loss": 0.78514063, + "learning_rate": 0.00018787249007455858, + "loss": 0.79600549, + "num_input_tokens_seen": 311736800, + "router_z_loss_mlp": 0.14709473, + "step": 3759, + "time_per_iteration": 2.613767147064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089448, + "balance_loss_mlp": 1.07513046, + "epoch": 0.7233551365909965, + "flos": 654868468224.0, + "grad_norm": 0.07096105030949329, + "language_loss": 0.71290004, + "learning_rate": 0.00018762916740048302, + "loss": 0.72379452, + "num_input_tokens_seen": 311806064, + "router_z_loss_mlp": 0.14318848, + "step": 3760, + "time_per_iteration": 2.822312355041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010919, + "balance_loss_mlp": 1.07746363, + "epoch": 0.7235475182762601, + "flos": 522365635584.0, + "grad_norm": 0.060444894943140336, + "language_loss": 0.85770047, + "learning_rate": 0.0001873859659995195, + "loss": 0.86861944, + "num_input_tokens_seen": 311881280, + "router_z_loss_mlp": 0.14428711, + "step": 3761, + "time_per_iteration": 2.7546091079711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096949, + "balance_loss_mlp": 1.08265626, + "epoch": 0.7237398999615237, + "flos": 609170595840.0, + "grad_norm": 0.0683412355594852, + "language_loss": 0.83724195, + "learning_rate": 0.0001871428859660878, + "loss": 0.84821141, + "num_input_tokens_seen": 311953696, + "router_z_loss_mlp": 0.14282227, + "step": 3762, + "time_per_iteration": 2.770059823989868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099122, + "balance_loss_mlp": 1.08488798, + "epoch": 0.7239322816467872, + "flos": 
658987176960.0, + "grad_norm": 0.08191796316314504, + "language_loss": 0.82060403, + "learning_rate": 0.00018689992739455975, + "loss": 0.8315953, + "num_input_tokens_seen": 312032752, + "router_z_loss_mlp": 0.14233398, + "step": 3763, + "time_per_iteration": 2.929271697998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092957, + "balance_loss_mlp": 1.07871115, + "epoch": 0.7241246633320508, + "flos": 969282878976.0, + "grad_norm": 0.06346083155179776, + "language_loss": 0.85959136, + "learning_rate": 0.00018665709037926027, + "loss": 0.87052089, + "num_input_tokens_seen": 312120800, + "router_z_loss_mlp": 0.14257812, + "step": 3764, + "time_per_iteration": 3.3369805812835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099745, + "balance_loss_mlp": 1.08588123, + "epoch": 0.7243170450173143, + "flos": 514995273216.0, + "grad_norm": 0.08806284436028786, + "language_loss": 0.84687865, + "learning_rate": 0.00018641437501446694, + "loss": 0.85787606, + "num_input_tokens_seen": 312188416, + "router_z_loss_mlp": 0.13867188, + "step": 3765, + "time_per_iteration": 2.622209072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096599, + "balance_loss_mlp": 1.08235359, + "epoch": 0.7245094267025779, + "flos": 559746796032.0, + "grad_norm": 0.07635972593652277, + "language_loss": 0.82246089, + "learning_rate": 0.0001861717813944104, + "loss": 0.83342695, + "num_input_tokens_seen": 312257792, + "router_z_loss_mlp": 0.14257812, + "step": 3766, + "time_per_iteration": 2.6759207248687744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095414, + "balance_loss_mlp": 1.08095431, + "epoch": 0.7247018083878415, + "flos": 612642134016.0, + "grad_norm": 0.0797588387433463, + "language_loss": 0.79539496, + "learning_rate": 0.00018592930961327365, + "loss": 0.8063491, + "num_input_tokens_seen": 312328544, + "router_z_loss_mlp": 0.14440918, + "step": 3767, + "time_per_iteration": 2.7272777557373047 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109367, + "balance_loss_mlp": 1.07921004, + "epoch": 0.7248941900731051, + "flos": 634676871168.0, + "grad_norm": 0.06368751268419225, + "language_loss": 0.87997532, + "learning_rate": 0.00018568695976519273, + "loss": 0.89091206, + "num_input_tokens_seen": 312405888, + "router_z_loss_mlp": 0.14440918, + "step": 3768, + "time_per_iteration": 2.835996389389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094879, + "balance_loss_mlp": 1.07991815, + "epoch": 0.7250865717583687, + "flos": 424941230592.0, + "grad_norm": 0.07271677335378793, + "language_loss": 0.80159616, + "learning_rate": 0.00018544473194425593, + "loss": 0.81254494, + "num_input_tokens_seen": 312469552, + "router_z_loss_mlp": 0.14941406, + "step": 3769, + "time_per_iteration": 2.51243257522583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092608, + "balance_loss_mlp": 1.07782626, + "epoch": 0.7252789534436321, + "flos": 635114068992.0, + "grad_norm": 0.10799987095433689, + "language_loss": 0.78685284, + "learning_rate": 0.00018520262624450485, + "loss": 0.79777896, + "num_input_tokens_seen": 312548848, + "router_z_loss_mlp": 0.14770508, + "step": 3770, + "time_per_iteration": 2.936739444732666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095578, + "balance_loss_mlp": 1.08109403, + "epoch": 0.7254713351288957, + "flos": 617185930752.0, + "grad_norm": 0.05982613005726902, + "language_loss": 0.87150741, + "learning_rate": 0.00018496064275993324, + "loss": 0.88246322, + "num_input_tokens_seen": 312622016, + "router_z_loss_mlp": 0.14453125, + "step": 3771, + "time_per_iteration": 2.775094747543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087233, + "balance_loss_mlp": 1.07266569, + "epoch": 0.7256637168141593, + "flos": 766986983424.0, + "grad_norm": 0.07412314995641861, + "language_loss": 0.81699574, + "learning_rate": 0.00018471878158448686, + "loss": 
0.82786798, + "num_input_tokens_seen": 312696960, + "router_z_loss_mlp": 0.14562988, + "step": 3772, + "time_per_iteration": 2.945774793624878 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093119, + "balance_loss_mlp": 1.07849216, + "epoch": 0.7258560984994229, + "flos": 495559503360.0, + "grad_norm": 0.0628089712415676, + "language_loss": 0.84061623, + "learning_rate": 0.00018447704281206512, + "loss": 0.85154736, + "num_input_tokens_seen": 312774352, + "router_z_loss_mlp": 0.14611816, + "step": 3773, + "time_per_iteration": 2.904330015182495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085201, + "balance_loss_mlp": 1.07037139, + "epoch": 0.7260484801846864, + "flos": 530069681664.0, + "grad_norm": 0.06945926815964382, + "language_loss": 0.82613432, + "learning_rate": 0.0001842354265365191, + "loss": 0.83698636, + "num_input_tokens_seen": 312849600, + "router_z_loss_mlp": 0.14819336, + "step": 3774, + "time_per_iteration": 4.125708818435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089216, + "balance_loss_mlp": 1.07469606, + "epoch": 0.72624086186995, + "flos": 624964128768.0, + "grad_norm": 0.10416012988421754, + "language_loss": 0.80548131, + "learning_rate": 0.0001839939328516526, + "loss": 0.81637341, + "num_input_tokens_seen": 312922688, + "router_z_loss_mlp": 0.1451416, + "step": 3775, + "time_per_iteration": 2.750432014465332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086316, + "balance_loss_mlp": 1.07196307, + "epoch": 0.7264332435552135, + "flos": 716522858496.0, + "grad_norm": 0.07329543067618247, + "language_loss": 0.81326902, + "learning_rate": 0.0001837525618512218, + "loss": 0.8241322, + "num_input_tokens_seen": 312997728, + "router_z_loss_mlp": 0.14343262, + "step": 3776, + "time_per_iteration": 2.9147586822509766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090392, + "balance_loss_mlp": 1.07571745, + "epoch": 0.7266256252404771, + 
"flos": 681036968448.0, + "grad_norm": 0.09666492868524106, + "language_loss": 0.82873899, + "learning_rate": 0.00018351131362893519, + "loss": 0.83964288, + "num_input_tokens_seen": 313067168, + "router_z_loss_mlp": 0.14660645, + "step": 3777, + "time_per_iteration": 2.857516050338745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087388, + "balance_loss_mlp": 1.07228446, + "epoch": 0.7268180069257407, + "flos": 518906580480.0, + "grad_norm": 0.07721555161828438, + "language_loss": 0.80164421, + "learning_rate": 0.00018327018827845364, + "loss": 0.81251806, + "num_input_tokens_seen": 313134688, + "router_z_loss_mlp": 0.15087891, + "step": 3778, + "time_per_iteration": 2.6123135089874268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088784, + "balance_loss_mlp": 1.07418132, + "epoch": 0.7270103886110042, + "flos": 512662804992.0, + "grad_norm": 0.07034168879093446, + "language_loss": 0.87450492, + "learning_rate": 0.00018302918589339036, + "loss": 0.88539279, + "num_input_tokens_seen": 313204816, + "router_z_loss_mlp": 0.14599609, + "step": 3779, + "time_per_iteration": 2.635622024536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089717, + "balance_loss_mlp": 1.07491088, + "epoch": 0.7272027702962678, + "flos": 546653919744.0, + "grad_norm": 0.06972150146327356, + "language_loss": 0.89592332, + "learning_rate": 0.00018278830656731054, + "loss": 0.90682048, + "num_input_tokens_seen": 313274288, + "router_z_loss_mlp": 0.14782715, + "step": 3780, + "time_per_iteration": 2.7083652019500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089508, + "balance_loss_mlp": 1.07489288, + "epoch": 0.7273951519815314, + "flos": 593048521728.0, + "grad_norm": 0.06413088918565218, + "language_loss": 0.86338055, + "learning_rate": 0.00018254755039373222, + "loss": 0.87427557, + "num_input_tokens_seen": 313344800, + "router_z_loss_mlp": 0.14599609, + "step": 3781, + "time_per_iteration": 
2.746243953704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083719, + "balance_loss_mlp": 1.06884193, + "epoch": 0.727587533666795, + "flos": 606012917760.0, + "grad_norm": 0.07626368504613235, + "language_loss": 0.83212483, + "learning_rate": 0.0001823069174661252, + "loss": 0.84296203, + "num_input_tokens_seen": 313417840, + "router_z_loss_mlp": 0.14855957, + "step": 3782, + "time_per_iteration": 2.8131794929504395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076535, + "balance_loss_mlp": 1.06205118, + "epoch": 0.7277799153520584, + "flos": 513021081600.0, + "grad_norm": 0.06295680687034302, + "language_loss": 0.78633702, + "learning_rate": 0.00018206640787791112, + "loss": 0.79710239, + "num_input_tokens_seen": 313485936, + "router_z_loss_mlp": 0.14453125, + "step": 3783, + "time_per_iteration": 2.6886956691741943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085999, + "balance_loss_mlp": 1.07144356, + "epoch": 0.727972297037322, + "flos": 537756475392.0, + "grad_norm": 0.06647490190453816, + "language_loss": 0.85873067, + "learning_rate": 0.00018182602172246416, + "loss": 0.86959064, + "num_input_tokens_seen": 313553136, + "router_z_loss_mlp": 0.14575195, + "step": 3784, + "time_per_iteration": 2.6511552333831787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086575, + "balance_loss_mlp": 1.07187629, + "epoch": 0.7281646787225856, + "flos": 535038566400.0, + "grad_norm": 0.08017450109012514, + "language_loss": 0.76435304, + "learning_rate": 0.00018158575909311075, + "loss": 0.77521873, + "num_input_tokens_seen": 313620128, + "router_z_loss_mlp": 0.14685059, + "step": 3785, + "time_per_iteration": 2.681915044784546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084547, + "balance_loss_mlp": 1.06974173, + "epoch": 0.7283570604078492, + "flos": 625055533056.0, + "grad_norm": 0.08921915239194265, + "language_loss": 0.79687071, + "learning_rate": 
0.000181345620083129, + "loss": 0.80771625, + "num_input_tokens_seen": 313696432, + "router_z_loss_mlp": 0.14794922, + "step": 3786, + "time_per_iteration": 2.7836999893188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079267, + "balance_loss_mlp": 1.06438935, + "epoch": 0.7285494420931128, + "flos": 534173709312.0, + "grad_norm": 0.06165566569921882, + "language_loss": 0.86873049, + "learning_rate": 0.00018110560478574927, + "loss": 0.8795231, + "num_input_tokens_seen": 313768416, + "router_z_loss_mlp": 0.14855957, + "step": 3787, + "time_per_iteration": 2.7312710285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076321, + "balance_loss_mlp": 1.06149101, + "epoch": 0.7287418237783763, + "flos": 666548061696.0, + "grad_norm": 0.08287585285923407, + "language_loss": 0.80037522, + "learning_rate": 0.0001808657132941533, + "loss": 0.81113839, + "num_input_tokens_seen": 313839888, + "router_z_loss_mlp": 0.14807129, + "step": 3788, + "time_per_iteration": 2.8081939220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076757, + "balance_loss_mlp": 1.06220174, + "epoch": 0.7289342054636399, + "flos": 550602302976.0, + "grad_norm": 0.07558714577930627, + "language_loss": 0.83176941, + "learning_rate": 0.00018062594570147572, + "loss": 0.84253705, + "num_input_tokens_seen": 313908832, + "router_z_loss_mlp": 0.14562988, + "step": 3789, + "time_per_iteration": 2.6432437896728516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083922, + "balance_loss_mlp": 1.0693903, + "epoch": 0.7291265871489034, + "flos": 687923145216.0, + "grad_norm": 0.07287349228687776, + "language_loss": 0.84963691, + "learning_rate": 0.00018038630210080243, + "loss": 0.86047614, + "num_input_tokens_seen": 313982672, + "router_z_loss_mlp": 0.14526367, + "step": 3790, + "time_per_iteration": 2.865356683731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073151, + "balance_loss_mlp": 
1.05852365, + "epoch": 0.729318968834167, + "flos": 572664204288.0, + "grad_norm": 0.07168736979899527, + "language_loss": 0.8499006, + "learning_rate": 0.0001801467825851712, + "loss": 0.86063206, + "num_input_tokens_seen": 314057184, + "router_z_loss_mlp": 0.14611816, + "step": 3791, + "time_per_iteration": 2.7372162342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073095, + "balance_loss_mlp": 1.05845594, + "epoch": 0.7295113505194305, + "flos": 586061028864.0, + "grad_norm": 0.07281056570289735, + "language_loss": 0.78196633, + "learning_rate": 0.00017990738724757172, + "loss": 0.79269731, + "num_input_tokens_seen": 314137344, + "router_z_loss_mlp": 0.14611816, + "step": 3792, + "time_per_iteration": 2.8774027824401855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107242, + "balance_loss_mlp": 1.05760276, + "epoch": 0.7297037322046941, + "flos": 707185645056.0, + "grad_norm": 0.06295411863995527, + "language_loss": 0.82293737, + "learning_rate": 0.00017966811618094598, + "loss": 0.83366162, + "num_input_tokens_seen": 314214464, + "router_z_loss_mlp": 0.14794922, + "step": 3793, + "time_per_iteration": 2.945582151412964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074316, + "balance_loss_mlp": 1.05921233, + "epoch": 0.7298961138899577, + "flos": 487292350464.0, + "grad_norm": 0.08262020885938813, + "language_loss": 0.8475967, + "learning_rate": 0.00017942896947818664, + "loss": 0.85833991, + "num_input_tokens_seen": 314280432, + "router_z_loss_mlp": 0.15075684, + "step": 3794, + "time_per_iteration": 2.6285526752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_mlp": 1.01879299, + "epoch": 0.7300884955752213, + "flos": 1365804260352.0, + "grad_norm": 0.019285645442211487, + "language_loss": 0.74825054, + "learning_rate": 0.000179189947232139, + "loss": 0.75850904, + "num_input_tokens_seen": 314497152, + "router_z_loss_mlp": 0.07080078, + "step": 
3795, + "time_per_iteration": 4.929250478744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072587, + "balance_loss_mlp": 1.05812693, + "epoch": 0.7302808772604849, + "flos": 531806736384.0, + "grad_norm": 0.09431722804853598, + "language_loss": 0.85334897, + "learning_rate": 0.00017895104953559947, + "loss": 0.86407483, + "num_input_tokens_seen": 314565488, + "router_z_loss_mlp": 0.14453125, + "step": 3796, + "time_per_iteration": 2.605687141418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082695, + "balance_loss_mlp": 1.06815124, + "epoch": 0.7304732589457483, + "flos": 436171143168.0, + "grad_norm": 0.08633113944613344, + "language_loss": 0.89526945, + "learning_rate": 0.00017871227648131672, + "loss": 0.9060964, + "num_input_tokens_seen": 314627392, + "router_z_loss_mlp": 0.1451416, + "step": 3797, + "time_per_iteration": 2.521352767944336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072295, + "balance_loss_mlp": 1.05728662, + "epoch": 0.7306656406310119, + "flos": 451621080576.0, + "grad_norm": 0.06678098801503493, + "language_loss": 0.82943988, + "learning_rate": 0.0001784736281619907, + "loss": 0.84016287, + "num_input_tokens_seen": 314695440, + "router_z_loss_mlp": 0.14978027, + "step": 3798, + "time_per_iteration": 2.609084129333496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074305, + "balance_loss_mlp": 1.05968988, + "epoch": 0.7308580223162755, + "flos": 512010491904.0, + "grad_norm": 0.0786239518455689, + "language_loss": 0.74484026, + "learning_rate": 0.00017823510467027232, + "loss": 0.75558329, + "num_input_tokens_seen": 314772592, + "router_z_loss_mlp": 0.14599609, + "step": 3799, + "time_per_iteration": 2.8423922061920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067282, + "balance_loss_mlp": 1.05283403, + "epoch": 0.7310504040015391, + "flos": 375423455232.0, + "grad_norm": 0.07912584621582001, + "language_loss": 0.78262001, + 
"learning_rate": 0.00017799670609876516, + "loss": 0.79329282, + "num_input_tokens_seen": 314836192, + "router_z_loss_mlp": 0.14477539, + "step": 3800, + "time_per_iteration": 2.535236120223999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071664, + "balance_loss_mlp": 1.05734682, + "epoch": 0.7312427856868026, + "flos": 549334752768.0, + "grad_norm": 0.06546690696594622, + "language_loss": 0.88949418, + "learning_rate": 0.00017775843254002366, + "loss": 0.90021086, + "num_input_tokens_seen": 314908400, + "router_z_loss_mlp": 0.14306641, + "step": 3801, + "time_per_iteration": 2.7845892906188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075479, + "balance_loss_mlp": 1.06116223, + "epoch": 0.7314351673720662, + "flos": 767238801408.0, + "grad_norm": 0.06442177991273089, + "language_loss": 0.83698308, + "learning_rate": 0.00017752028408655367, + "loss": 0.84773785, + "num_input_tokens_seen": 314995280, + "router_z_loss_mlp": 0.14306641, + "step": 3802, + "time_per_iteration": 3.0654079914093018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073558, + "balance_loss_mlp": 1.05856121, + "epoch": 0.7316275490573297, + "flos": 486734012928.0, + "grad_norm": 0.177225948577802, + "language_loss": 0.85229474, + "learning_rate": 0.00017728226083081272, + "loss": 0.86303031, + "num_input_tokens_seen": 315063056, + "router_z_loss_mlp": 0.14978027, + "step": 3803, + "time_per_iteration": 2.5718350410461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074242, + "balance_loss_mlp": 1.05967474, + "epoch": 0.7318199307425933, + "flos": 473428592640.0, + "grad_norm": 0.08565568804066387, + "language_loss": 0.81454623, + "learning_rate": 0.00017704436286520965, + "loss": 0.82528865, + "num_input_tokens_seen": 315128896, + "router_z_loss_mlp": 0.14538574, + "step": 3804, + "time_per_iteration": 2.6038320064544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106992, + 
"balance_loss_mlp": 1.05504251, + "epoch": 0.7320123124278569, + "flos": 549463233024.0, + "grad_norm": 0.12360179299397371, + "language_loss": 0.8468073, + "learning_rate": 0.0001768065902821046, + "loss": 0.85750651, + "num_input_tokens_seen": 315198464, + "router_z_loss_mlp": 0.1484375, + "step": 3805, + "time_per_iteration": 2.651048183441162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071694, + "balance_loss_mlp": 1.05726933, + "epoch": 0.7322046941131204, + "flos": 570781416960.0, + "grad_norm": 0.07802569861836066, + "language_loss": 0.82316971, + "learning_rate": 0.00017656894317380907, + "loss": 0.83388662, + "num_input_tokens_seen": 315270240, + "router_z_loss_mlp": 0.14416504, + "step": 3806, + "time_per_iteration": 2.756763219833374 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014485, + "balance_loss_mlp": 1.00723755, + "epoch": 0.732397075798384, + "flos": 1469165548032.0, + "grad_norm": 0.009270144136788097, + "language_loss": 0.76031268, + "learning_rate": 0.00017633142163258565, + "loss": 0.77045751, + "num_input_tokens_seen": 315502448, + "router_z_loss_mlp": 0.07226562, + "step": 3807, + "time_per_iteration": 5.025331735610962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075041, + "balance_loss_mlp": 1.06017613, + "epoch": 0.7325894574836476, + "flos": 464862260736.0, + "grad_norm": 0.08110176134528321, + "language_loss": 0.83730799, + "learning_rate": 0.00017609402575064875, + "loss": 0.8480584, + "num_input_tokens_seen": 315569472, + "router_z_loss_mlp": 0.1484375, + "step": 3808, + "time_per_iteration": 2.5651097297668457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080989, + "balance_loss_mlp": 1.06662416, + "epoch": 0.7327818391689112, + "flos": 495493065216.0, + "grad_norm": 0.07932211737712976, + "language_loss": 0.81102324, + "learning_rate": 0.00017585675562016367, + "loss": 0.82183307, + "num_input_tokens_seen": 315637632, + "router_z_loss_mlp": 
0.14355469, + "step": 3809, + "time_per_iteration": 2.6230361461639404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016113, + "balance_loss_mlp": 1.00881767, + "epoch": 0.7329742208541746, + "flos": 1433489508864.0, + "grad_norm": 0.01295473198731384, + "language_loss": 0.77212846, + "learning_rate": 0.0001756196113332465, + "loss": 0.78228962, + "num_input_tokens_seen": 315863648, + "router_z_loss_mlp": 0.07275391, + "step": 3810, + "time_per_iteration": 4.819159746170044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081965, + "balance_loss_mlp": 1.06713569, + "epoch": 0.7331666025394382, + "flos": 496889095680.0, + "grad_norm": 0.07185927058157819, + "language_loss": 0.8484388, + "learning_rate": 0.00017538259298196474, + "loss": 0.85925841, + "num_input_tokens_seen": 315930752, + "router_z_loss_mlp": 0.14819336, + "step": 3811, + "time_per_iteration": 2.5887067317962646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079969, + "balance_loss_mlp": 1.06556845, + "epoch": 0.7333589842247018, + "flos": 538524785664.0, + "grad_norm": 0.0628616136872852, + "language_loss": 0.81993341, + "learning_rate": 0.00017514570065833745, + "loss": 0.83073318, + "num_input_tokens_seen": 316006400, + "router_z_loss_mlp": 0.14379883, + "step": 3812, + "time_per_iteration": 2.7150583267211914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082723, + "balance_loss_mlp": 1.06795263, + "epoch": 0.7335513659099654, + "flos": 491067836928.0, + "grad_norm": 0.08278360701185013, + "language_loss": 0.80552948, + "learning_rate": 0.00017490893445433426, + "loss": 0.81635672, + "num_input_tokens_seen": 316075824, + "router_z_loss_mlp": 0.14746094, + "step": 3813, + "time_per_iteration": 2.595290422439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080582, + "balance_loss_mlp": 1.0658834, + "epoch": 0.733743747595229, + "flos": 562150844928.0, + "grad_norm": 0.06487588714867228, + 
"language_loss": 0.81347382, + "learning_rate": 0.00017467229446187587, + "loss": 0.82427955, + "num_input_tokens_seen": 316148336, + "router_z_loss_mlp": 0.14709473, + "step": 3814, + "time_per_iteration": 2.7173616886138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089973, + "balance_loss_mlp": 1.07540596, + "epoch": 0.7339361292804925, + "flos": 538581685248.0, + "grad_norm": 0.08798326338090434, + "language_loss": 0.81541699, + "learning_rate": 0.00017443578077283424, + "loss": 0.82631671, + "num_input_tokens_seen": 316220960, + "router_z_loss_mlp": 0.14550781, + "step": 3815, + "time_per_iteration": 2.7105531692504883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087021, + "balance_loss_mlp": 1.07223916, + "epoch": 0.734128510965756, + "flos": 548469895680.0, + "grad_norm": 0.06566892057084078, + "language_loss": 0.84730685, + "learning_rate": 0.0001741993934790319, + "loss": 0.85817701, + "num_input_tokens_seen": 316295824, + "router_z_loss_mlp": 0.14770508, + "step": 3816, + "time_per_iteration": 2.77266001701355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107762, + "balance_loss_mlp": 1.06295753, + "epoch": 0.7343208926510196, + "flos": 540066548736.0, + "grad_norm": 0.09067152232159664, + "language_loss": 0.84255576, + "learning_rate": 0.00017396313267224273, + "loss": 0.85333198, + "num_input_tokens_seen": 316368064, + "router_z_loss_mlp": 0.14660645, + "step": 3817, + "time_per_iteration": 2.7289724349975586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082678, + "balance_loss_mlp": 1.06814599, + "epoch": 0.7345132743362832, + "flos": 571095277056.0, + "grad_norm": 0.07934793398680723, + "language_loss": 0.8837018, + "learning_rate": 0.0001737269984441912, + "loss": 0.89452857, + "num_input_tokens_seen": 316437440, + "router_z_loss_mlp": 0.14526367, + "step": 3818, + "time_per_iteration": 2.679121255874634 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0108093, + "balance_loss_mlp": 1.06629074, + "epoch": 0.7347056560215467, + "flos": 545403621888.0, + "grad_norm": 0.06604620451137376, + "language_loss": 0.85161746, + "learning_rate": 0.00017349099088655263, + "loss": 0.86242676, + "num_input_tokens_seen": 316511936, + "router_z_loss_mlp": 0.14624023, + "step": 3819, + "time_per_iteration": 2.7354836463928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107929, + "balance_loss_mlp": 1.06509197, + "epoch": 0.7348980377068103, + "flos": 595949239296.0, + "grad_norm": 0.06952246164346525, + "language_loss": 0.80691403, + "learning_rate": 0.00017325511009095375, + "loss": 0.81770694, + "num_input_tokens_seen": 316584304, + "router_z_loss_mlp": 0.14208984, + "step": 3820, + "time_per_iteration": 2.7548413276672363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072543, + "balance_loss_mlp": 1.05823815, + "epoch": 0.7350904193920739, + "flos": 538554521088.0, + "grad_norm": 0.06808643977119672, + "language_loss": 0.83516192, + "learning_rate": 0.00017301935614897113, + "loss": 0.8458873, + "num_input_tokens_seen": 316659024, + "router_z_loss_mlp": 0.14318848, + "step": 3821, + "time_per_iteration": 2.7016494274139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074862, + "balance_loss_mlp": 1.0602231, + "epoch": 0.7352828010773375, + "flos": 512981434368.0, + "grad_norm": 0.06002582073431783, + "language_loss": 0.8197211, + "learning_rate": 0.00017278372915213274, + "loss": 0.83046979, + "num_input_tokens_seen": 316732544, + "router_z_loss_mlp": 0.14624023, + "step": 3822, + "time_per_iteration": 2.6761066913604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01016556, + "balance_loss_mlp": 1.00921309, + "epoch": 0.735475182762601, + "flos": 1553820848640.0, + "grad_norm": 0.014100117797771941, + "language_loss": 0.79893845, + "learning_rate": 0.00017254822919191693, + "loss": 0.80910403, + "num_input_tokens_seen": 
316967104, + "router_z_loss_mlp": 0.07324219, + "step": 3823, + "time_per_iteration": 5.009763956069946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074081, + "balance_loss_mlp": 1.05965686, + "epoch": 0.7356675644478645, + "flos": 681308610048.0, + "grad_norm": 0.08234273424412843, + "language_loss": 0.806014, + "learning_rate": 0.00017231285635975314, + "loss": 0.81675482, + "num_input_tokens_seen": 317048304, + "router_z_loss_mlp": 0.1439209, + "step": 3824, + "time_per_iteration": 2.9129364490509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069863, + "balance_loss_mlp": 1.05441332, + "epoch": 0.7358599461331281, + "flos": 515215157760.0, + "grad_norm": 0.08116369820319415, + "language_loss": 0.82920796, + "learning_rate": 0.00017207761074702115, + "loss": 0.83990657, + "num_input_tokens_seen": 317115968, + "router_z_loss_mlp": 0.1541748, + "step": 3825, + "time_per_iteration": 2.641977071762085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069473, + "balance_loss_mlp": 1.05479813, + "epoch": 0.7360523278183917, + "flos": 443973934080.0, + "grad_norm": 0.06363910261754813, + "language_loss": 0.83689082, + "learning_rate": 0.0001718424924450514, + "loss": 0.8475855, + "num_input_tokens_seen": 317185680, + "router_z_loss_mlp": 0.14660645, + "step": 3826, + "time_per_iteration": 2.6134989261627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106716, + "balance_loss_mlp": 1.05211544, + "epoch": 0.7362447095036553, + "flos": 603423489024.0, + "grad_norm": 0.06392814442784994, + "language_loss": 0.85810113, + "learning_rate": 0.00017160750154512482, + "loss": 0.86877275, + "num_input_tokens_seen": 317258800, + "router_z_loss_mlp": 0.15026855, + "step": 3827, + "time_per_iteration": 2.736499786376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071488, + "balance_loss_mlp": 1.05717158, + "epoch": 0.7364370911889189, + "flos": 553095184896.0, + "grad_norm": 
0.060676486527101066, + "language_loss": 0.83347571, + "learning_rate": 0.0001713726381384731, + "loss": 0.8441906, + "num_input_tokens_seen": 317334608, + "router_z_loss_mlp": 0.14318848, + "step": 3828, + "time_per_iteration": 2.7891271114349365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067133, + "balance_loss_mlp": 1.05237508, + "epoch": 0.7366294728741823, + "flos": 449061387264.0, + "grad_norm": 0.07991922680329289, + "language_loss": 0.81341559, + "learning_rate": 0.00017113790231627812, + "loss": 0.8240869, + "num_input_tokens_seen": 317397504, + "router_z_loss_mlp": 0.14733887, + "step": 3829, + "time_per_iteration": 2.525070905685425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011404, + "balance_loss_mlp": 1.00415587, + "epoch": 0.7368218545594459, + "flos": 1535502500352.0, + "grad_norm": 0.013118834983913303, + "language_loss": 0.79258227, + "learning_rate": 0.0001709032941696726, + "loss": 0.80269623, + "num_input_tokens_seen": 317611472, + "router_z_loss_mlp": 0.07226562, + "step": 3830, + "time_per_iteration": 4.838621377944946 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106649, + "balance_loss_mlp": 1.05186284, + "epoch": 0.7370142362447095, + "flos": 515425130496.0, + "grad_norm": 0.07911932608285421, + "language_loss": 0.81592512, + "learning_rate": 0.00017066881378973936, + "loss": 0.82659006, + "num_input_tokens_seen": 317681328, + "router_z_loss_mlp": 0.14587402, + "step": 3831, + "time_per_iteration": 2.7149910926818848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106728, + "balance_loss_mlp": 1.0528084, + "epoch": 0.7372066179299731, + "flos": 500805172224.0, + "grad_norm": 0.06667618306638196, + "language_loss": 0.82793903, + "learning_rate": 0.00017043446126751189, + "loss": 0.8386119, + "num_input_tokens_seen": 317752336, + "router_z_loss_mlp": 0.14453125, + "step": 3832, + "time_per_iteration": 2.6927688121795654 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.0106817, + "balance_loss_mlp": 1.0533402, + "epoch": 0.7373989996152366, + "flos": 558083893248.0, + "grad_norm": 0.07864183191565526, + "language_loss": 0.76522374, + "learning_rate": 0.00017020023669397376, + "loss": 0.77590549, + "num_input_tokens_seen": 317824112, + "router_z_loss_mlp": 0.14819336, + "step": 3833, + "time_per_iteration": 2.7058162689208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107006, + "balance_loss_mlp": 1.05456233, + "epoch": 0.7375913813005002, + "flos": 506777306112.0, + "grad_norm": 0.08760745702981601, + "language_loss": 0.81515223, + "learning_rate": 0.0001699661401600589, + "loss": 0.82585281, + "num_input_tokens_seen": 317889120, + "router_z_loss_mlp": 0.15478516, + "step": 3834, + "time_per_iteration": 2.6158528327941895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106653, + "balance_loss_mlp": 1.05206978, + "epoch": 0.7377837629857638, + "flos": 486183015936.0, + "grad_norm": 0.07963589333837205, + "language_loss": 0.78064704, + "learning_rate": 0.00016973217175665205, + "loss": 0.79131228, + "num_input_tokens_seen": 317953792, + "router_z_loss_mlp": 0.14453125, + "step": 3835, + "time_per_iteration": 2.6113386154174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005306, + "balance_loss_mlp": 0.99843931, + "epoch": 0.7379761446710273, + "flos": 1414693942272.0, + "grad_norm": 0.007558216579010849, + "language_loss": 0.8116616, + "learning_rate": 0.00016949833157458755, + "loss": 0.82171464, + "num_input_tokens_seen": 318184848, + "router_z_loss_mlp": 0.06884766, + "step": 3836, + "time_per_iteration": 4.930665493011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071082, + "balance_loss_mlp": 1.05599046, + "epoch": 0.7381685263562909, + "flos": 629737721856.0, + "grad_norm": 0.07838551299757777, + "language_loss": 0.84225684, + "learning_rate": 0.00016926461970465047, + "loss": 0.85296762, + 
"num_input_tokens_seen": 318259296, + "router_z_loss_mlp": 0.15063477, + "step": 3837, + "time_per_iteration": 2.7925772666931152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069523, + "balance_loss_mlp": 1.05480027, + "epoch": 0.7383609080415544, + "flos": 739224589824.0, + "grad_norm": 0.06636874651090781, + "language_loss": 0.84278762, + "learning_rate": 0.00016903103623757516, + "loss": 0.8534829, + "num_input_tokens_seen": 318344704, + "router_z_loss_mlp": 0.14709473, + "step": 3838, + "time_per_iteration": 3.077704429626465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_mlp": 1.0535419, + "epoch": 0.738553289726818, + "flos": 550206950400.0, + "grad_norm": 0.0837224725732271, + "language_loss": 0.79925728, + "learning_rate": 0.00016879758126404738, + "loss": 0.80994093, + "num_input_tokens_seen": 318416128, + "router_z_loss_mlp": 0.14819336, + "step": 3839, + "time_per_iteration": 2.7352871894836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072201, + "balance_loss_mlp": 1.05714512, + "epoch": 0.7387456714120816, + "flos": 910294640640.0, + "grad_norm": 0.07590823763574843, + "language_loss": 0.80038518, + "learning_rate": 0.00016856425487470216, + "loss": 0.81110722, + "num_input_tokens_seen": 318498128, + "router_z_loss_mlp": 0.15039062, + "step": 3840, + "time_per_iteration": 3.1132917404174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070306, + "balance_loss_mlp": 1.05497539, + "epoch": 0.7389380530973452, + "flos": 852684807168.0, + "grad_norm": 0.0859256588835734, + "language_loss": 0.78885496, + "learning_rate": 0.00016833105716012486, + "loss": 0.79955798, + "num_input_tokens_seen": 318578048, + "router_z_loss_mlp": 0.15307617, + "step": 3841, + "time_per_iteration": 3.1532208919525146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067671, + "balance_loss_mlp": 1.05297232, + "epoch": 0.7391304347826086, + "flos": 
817026020352.0, + "grad_norm": 0.06792202219363284, + "language_loss": 0.84900254, + "learning_rate": 0.00016809798821085088, + "loss": 0.85967922, + "num_input_tokens_seen": 318654784, + "router_z_loss_mlp": 0.14660645, + "step": 3842, + "time_per_iteration": 3.01279354095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070418, + "balance_loss_mlp": 1.05537403, + "epoch": 0.7393228164678722, + "flos": 572819848704.0, + "grad_norm": 0.0638380683182141, + "language_loss": 0.88969815, + "learning_rate": 0.00016786504811736565, + "loss": 0.90040231, + "num_input_tokens_seen": 318727680, + "router_z_loss_mlp": 0.15014648, + "step": 3843, + "time_per_iteration": 2.6979498863220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071016, + "balance_loss_mlp": 1.05594802, + "epoch": 0.7395151981531358, + "flos": 685237169664.0, + "grad_norm": 0.061553978302081376, + "language_loss": 0.82327366, + "learning_rate": 0.00016763223697010442, + "loss": 0.83398378, + "num_input_tokens_seen": 318807568, + "router_z_loss_mlp": 0.1505127, + "step": 3844, + "time_per_iteration": 2.9502556324005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067572, + "balance_loss_mlp": 1.05302894, + "epoch": 0.7397075798383994, + "flos": 556366662144.0, + "grad_norm": 0.056403600780772105, + "language_loss": 0.84155715, + "learning_rate": 0.00016739955485945256, + "loss": 0.85223293, + "num_input_tokens_seen": 318881792, + "router_z_loss_mlp": 0.14538574, + "step": 3845, + "time_per_iteration": 2.7162668704986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070346, + "balance_loss_mlp": 1.05569506, + "epoch": 0.739899961523663, + "flos": 546782400000.0, + "grad_norm": 0.081576664955192, + "language_loss": 0.86097336, + "learning_rate": 0.00016716700187574513, + "loss": 0.87167686, + "num_input_tokens_seen": 318951552, + "router_z_loss_mlp": 0.14648438, + "step": 3846, + "time_per_iteration": 2.6993191242218018 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073439, + "balance_loss_mlp": 1.0585022, + "epoch": 0.7400923432089265, + "flos": 609190419456.0, + "grad_norm": 0.06966979530394013, + "language_loss": 0.83732522, + "learning_rate": 0.0001669345781092675, + "loss": 0.84805954, + "num_input_tokens_seen": 319022304, + "router_z_loss_mlp": 0.14916992, + "step": 3847, + "time_per_iteration": 2.770108699798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075946, + "balance_loss_mlp": 1.06135464, + "epoch": 0.7402847248941901, + "flos": 591007518720.0, + "grad_norm": 0.06701111666950413, + "language_loss": 0.8687951, + "learning_rate": 0.0001667022836502546, + "loss": 0.87955451, + "num_input_tokens_seen": 319093200, + "router_z_loss_mlp": 0.14587402, + "step": 3848, + "time_per_iteration": 2.7933013439178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075686, + "balance_loss_mlp": 1.06097555, + "epoch": 0.7404771065794536, + "flos": 477369635328.0, + "grad_norm": 0.10971052102255037, + "language_loss": 0.8283127, + "learning_rate": 0.00016647011858889077, + "loss": 0.8390696, + "num_input_tokens_seen": 319159712, + "router_z_loss_mlp": 0.14709473, + "step": 3849, + "time_per_iteration": 2.5821588039398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107146, + "balance_loss_mlp": 1.05672526, + "epoch": 0.7406694882647172, + "flos": 496446755328.0, + "grad_norm": 0.08016384906089048, + "language_loss": 0.86103845, + "learning_rate": 0.00016623808301531056, + "loss": 0.87175304, + "num_input_tokens_seen": 319230544, + "router_z_loss_mlp": 0.1472168, + "step": 3850, + "time_per_iteration": 2.6669719219207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071974, + "balance_loss_mlp": 1.05720425, + "epoch": 0.7408618699499807, + "flos": 562205173248.0, + "grad_norm": 0.08205354684217782, + "language_loss": 0.79157412, + "learning_rate": 0.00016600617701959842, + "loss": 
0.80229384, + "num_input_tokens_seen": 319305440, + "router_z_loss_mlp": 0.14746094, + "step": 3851, + "time_per_iteration": 2.747596502304077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006701, + "balance_loss_mlp": 1.00007319, + "epoch": 0.7410542516352443, + "flos": 1388228834304.0, + "grad_norm": 0.0072472756451880905, + "language_loss": 0.78843814, + "learning_rate": 0.00016577440069178811, + "loss": 0.79850513, + "num_input_tokens_seen": 319534384, + "router_z_loss_mlp": 0.06640625, + "step": 3852, + "time_per_iteration": 5.075153350830078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074994, + "balance_loss_mlp": 1.06011701, + "epoch": 0.7412466333205079, + "flos": 669999776256.0, + "grad_norm": 0.07625474461693704, + "language_loss": 0.81200403, + "learning_rate": 0.00016554275412186315, + "loss": 0.82275391, + "num_input_tokens_seen": 319610960, + "router_z_loss_mlp": 0.14868164, + "step": 3853, + "time_per_iteration": 2.83164119720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069097, + "balance_loss_mlp": 1.05436301, + "epoch": 0.7414390150057715, + "flos": 489293706240.0, + "grad_norm": 0.08701956870254486, + "language_loss": 0.80909944, + "learning_rate": 0.0001653112373997568, + "loss": 0.81979048, + "num_input_tokens_seen": 319683872, + "router_z_loss_mlp": 0.1472168, + "step": 3854, + "time_per_iteration": 2.6991629600524902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072539, + "balance_loss_mlp": 1.057459, + "epoch": 0.7416313966910351, + "flos": 599393613312.0, + "grad_norm": 0.08599035855505702, + "language_loss": 0.7489301, + "learning_rate": 0.0001650798506153517, + "loss": 0.75965548, + "num_input_tokens_seen": 319750032, + "router_z_loss_mlp": 0.1505127, + "step": 3855, + "time_per_iteration": 2.6856653690338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070697, + "balance_loss_mlp": 1.05554581, + "epoch": 0.7418237783762985, + 
"flos": 542539980288.0, + "grad_norm": 0.07804718077998271, + "language_loss": 0.84300339, + "learning_rate": 0.00016484859385848023, + "loss": 0.85371041, + "num_input_tokens_seen": 319818864, + "router_z_loss_mlp": 0.15124512, + "step": 3856, + "time_per_iteration": 2.637263059616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065824, + "balance_loss_mlp": 1.0510422, + "epoch": 0.7420161600615621, + "flos": 544136071680.0, + "grad_norm": 0.07531615972312422, + "language_loss": 0.77476895, + "learning_rate": 0.0001646174672189243, + "loss": 0.78542721, + "num_input_tokens_seen": 319888816, + "router_z_loss_mlp": 0.14770508, + "step": 3857, + "time_per_iteration": 2.6742300987243652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072034, + "balance_loss_mlp": 1.0568707, + "epoch": 0.7422085417468257, + "flos": 527178875904.0, + "grad_norm": 0.07664417369096119, + "language_loss": 0.80383694, + "learning_rate": 0.00016438647078641488, + "loss": 0.81455731, + "num_input_tokens_seen": 319956176, + "router_z_loss_mlp": 0.15148926, + "step": 3858, + "time_per_iteration": 2.6050353050231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070582, + "balance_loss_mlp": 1.05539477, + "epoch": 0.7424009234320893, + "flos": 508674774528.0, + "grad_norm": 0.07203197801736921, + "language_loss": 0.83144253, + "learning_rate": 0.00016415560465063344, + "loss": 0.84214836, + "num_input_tokens_seen": 320028560, + "router_z_loss_mlp": 0.15161133, + "step": 3859, + "time_per_iteration": 2.7623064517974854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072258, + "balance_loss_mlp": 1.05751181, + "epoch": 0.7425933051173528, + "flos": 512598564864.0, + "grad_norm": 0.06874041780278002, + "language_loss": 0.79038745, + "learning_rate": 0.0001639248689012095, + "loss": 0.80111003, + "num_input_tokens_seen": 320096112, + "router_z_loss_mlp": 0.14733887, + "step": 3860, + "time_per_iteration": 
2.5939531326293945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070268, + "balance_loss_mlp": 1.05512857, + "epoch": 0.7427856868026164, + "flos": 458302053888.0, + "grad_norm": 0.07350694530214436, + "language_loss": 0.87617624, + "learning_rate": 0.00016369426362772271, + "loss": 0.88687891, + "num_input_tokens_seen": 320168992, + "router_z_loss_mlp": 0.15136719, + "step": 3861, + "time_per_iteration": 2.8275485038757324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072029, + "balance_loss_mlp": 1.057199, + "epoch": 0.74297806848788, + "flos": 605019580416.0, + "grad_norm": 0.0620300979873649, + "language_loss": 0.80084789, + "learning_rate": 0.00016346378891970233, + "loss": 0.8115682, + "num_input_tokens_seen": 320247264, + "router_z_loss_mlp": 0.14807129, + "step": 3862, + "time_per_iteration": 2.903923988342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078214, + "balance_loss_mlp": 1.06349134, + "epoch": 0.7431704501731435, + "flos": 891390044160.0, + "grad_norm": 0.08373807590972174, + "language_loss": 0.81505513, + "learning_rate": 0.00016323344486662633, + "loss": 0.82583725, + "num_input_tokens_seen": 320338992, + "router_z_loss_mlp": 0.14697266, + "step": 3863, + "time_per_iteration": 3.331378221511841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075731, + "balance_loss_mlp": 1.06079412, + "epoch": 0.7433628318584071, + "flos": 592163841024.0, + "grad_norm": 0.06625022737773377, + "language_loss": 0.78612608, + "learning_rate": 0.00016300323155792247, + "loss": 0.79688334, + "num_input_tokens_seen": 320422096, + "router_z_loss_mlp": 0.14941406, + "step": 3864, + "time_per_iteration": 2.961841583251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070996, + "balance_loss_mlp": 1.05604696, + "epoch": 0.7435552135436706, + "flos": 477154520064.0, + "grad_norm": 0.06667559747166675, + "language_loss": 0.88657868, + "learning_rate": 
0.00016277314908296687, + "loss": 0.89728856, + "num_input_tokens_seen": 320492640, + "router_z_loss_mlp": 0.14929199, + "step": 3865, + "time_per_iteration": 2.6347100734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072634, + "balance_loss_mlp": 1.05754232, + "epoch": 0.7437475952289342, + "flos": 673184618496.0, + "grad_norm": 0.09401519790686412, + "language_loss": 0.76145202, + "learning_rate": 0.00016254319753108604, + "loss": 0.77217835, + "num_input_tokens_seen": 320565264, + "router_z_loss_mlp": 0.15075684, + "step": 3866, + "time_per_iteration": 2.87519907951355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069984, + "balance_loss_mlp": 1.05523825, + "epoch": 0.7439399769141978, + "flos": 770428786176.0, + "grad_norm": 0.07662525510674034, + "language_loss": 0.76897246, + "learning_rate": 0.00016231337699155492, + "loss": 0.77967227, + "num_input_tokens_seen": 320647584, + "router_z_loss_mlp": 0.14733887, + "step": 3867, + "time_per_iteration": 2.983419418334961 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074113, + "balance_loss_mlp": 1.05875885, + "epoch": 0.7441323585994614, + "flos": 647777088000.0, + "grad_norm": 0.06858824495499428, + "language_loss": 0.78350103, + "learning_rate": 0.0001620836875535977, + "loss": 0.79424214, + "num_input_tokens_seen": 320722752, + "router_z_loss_mlp": 0.15332031, + "step": 3868, + "time_per_iteration": 2.917475461959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074549, + "balance_loss_mlp": 1.05954003, + "epoch": 0.7443247402847248, + "flos": 565372763136.0, + "grad_norm": 0.07353784330896508, + "language_loss": 0.80791712, + "learning_rate": 0.00016185412930638766, + "loss": 0.81866264, + "num_input_tokens_seen": 320802496, + "router_z_loss_mlp": 0.14990234, + "step": 3869, + "time_per_iteration": 2.8665554523468018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073848, + "balance_loss_mlp": 
1.05905402, + "epoch": 0.7445171219699884, + "flos": 578529879552.0, + "grad_norm": 0.07383455824846064, + "language_loss": 0.82674599, + "learning_rate": 0.00016162470233904765, + "loss": 0.83748442, + "num_input_tokens_seen": 320872496, + "router_z_loss_mlp": 0.14782715, + "step": 3870, + "time_per_iteration": 2.7601962089538574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074818, + "balance_loss_mlp": 1.05997705, + "epoch": 0.744709503655252, + "flos": 618875997696.0, + "grad_norm": 0.07839397285168428, + "language_loss": 0.82130003, + "learning_rate": 0.00016139540674064856, + "loss": 0.8320483, + "num_input_tokens_seen": 320944992, + "router_z_loss_mlp": 0.14819336, + "step": 3871, + "time_per_iteration": 2.733957290649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076556, + "balance_loss_mlp": 1.0619173, + "epoch": 0.7449018853405156, + "flos": 528619322880.0, + "grad_norm": 0.07038188530007786, + "language_loss": 0.77430081, + "learning_rate": 0.00016116624260021113, + "loss": 0.78506637, + "num_input_tokens_seen": 321020208, + "router_z_loss_mlp": 0.14624023, + "step": 3872, + "time_per_iteration": 2.800231456756592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071128, + "balance_loss_mlp": 1.05622649, + "epoch": 0.7450942670257792, + "flos": 433314842112.0, + "grad_norm": 0.08400374472729004, + "language_loss": 0.83973575, + "learning_rate": 0.0001609372100067046, + "loss": 0.85044706, + "num_input_tokens_seen": 321085984, + "router_z_loss_mlp": 0.14892578, + "step": 3873, + "time_per_iteration": 2.5605225563049316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071204, + "balance_loss_mlp": 1.05576611, + "epoch": 0.7452866487110427, + "flos": 696882258432.0, + "grad_norm": 0.09625530023155114, + "language_loss": 0.84883142, + "learning_rate": 0.0001607083090490475, + "loss": 0.85954344, + "num_input_tokens_seen": 321163200, + "router_z_loss_mlp": 0.1541748, + "step": 
3874, + "time_per_iteration": 2.9394900798797607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071613, + "balance_loss_mlp": 1.05664086, + "epoch": 0.7454790303963063, + "flos": 512210552832.0, + "grad_norm": 0.08058298210668029, + "language_loss": 0.79820347, + "learning_rate": 0.00016047953981610714, + "loss": 0.80891967, + "num_input_tokens_seen": 321237328, + "router_z_loss_mlp": 0.14953613, + "step": 3875, + "time_per_iteration": 2.7139272689819336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021273, + "balance_loss_mlp": 1.01435852, + "epoch": 0.7456714120815698, + "flos": 1325949668352.0, + "grad_norm": 0.011941727483725444, + "language_loss": 0.7972964, + "learning_rate": 0.00016025090239669916, + "loss": 0.80750912, + "num_input_tokens_seen": 321456192, + "router_z_loss_mlp": 0.06933594, + "step": 3876, + "time_per_iteration": 4.9671149253845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076145, + "balance_loss_mlp": 1.06137538, + "epoch": 0.7458637937668334, + "flos": 721711627776.0, + "grad_norm": 0.07051283090717735, + "language_loss": 0.80756205, + "learning_rate": 0.0001600223968795889, + "loss": 0.81832355, + "num_input_tokens_seen": 321530560, + "router_z_loss_mlp": 0.14758301, + "step": 3877, + "time_per_iteration": 2.9416282176971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020704, + "balance_loss_mlp": 1.01379037, + "epoch": 0.746056175452097, + "flos": 1501580395008.0, + "grad_norm": 0.011482501801392642, + "language_loss": 0.75696075, + "learning_rate": 0.00015979402335349004, + "loss": 0.76716781, + "num_input_tokens_seen": 321760928, + "router_z_loss_mlp": 0.06933594, + "step": 3878, + "time_per_iteration": 4.898989677429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075351, + "balance_loss_mlp": 1.06042576, + "epoch": 0.7462485571373605, + "flos": 520245711360.0, + "grad_norm": 0.0885851208026398, + "language_loss": 
0.81747985, + "learning_rate": 0.00015956578190706483, + "loss": 0.82823336, + "num_input_tokens_seen": 321833248, + "router_z_loss_mlp": 0.14904785, + "step": 3879, + "time_per_iteration": 2.6805362701416016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066599, + "balance_loss_mlp": 1.05175781, + "epoch": 0.7464409388226241, + "flos": 481206790656.0, + "grad_norm": 0.07723337503848805, + "language_loss": 0.75796139, + "learning_rate": 0.00015933767262892468, + "loss": 0.76862741, + "num_input_tokens_seen": 321905904, + "router_z_loss_mlp": 0.14831543, + "step": 3880, + "time_per_iteration": 2.7313079833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107623, + "balance_loss_mlp": 1.06153107, + "epoch": 0.7466333205078877, + "flos": 486761177088.0, + "grad_norm": 0.08607243998977276, + "language_loss": 0.82115054, + "learning_rate": 0.00015910969560762927, + "loss": 0.83191288, + "num_input_tokens_seen": 321971920, + "router_z_loss_mlp": 0.14685059, + "step": 3881, + "time_per_iteration": 2.633983612060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074372, + "balance_loss_mlp": 1.05975699, + "epoch": 0.7468257021931513, + "flos": 611293091328.0, + "grad_norm": 0.07136699104926861, + "language_loss": 0.83270466, + "learning_rate": 0.00015888185093168727, + "loss": 0.8434484, + "num_input_tokens_seen": 322041904, + "router_z_loss_mlp": 0.14587402, + "step": 3882, + "time_per_iteration": 2.7481329441070557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107382, + "balance_loss_mlp": 1.05850148, + "epoch": 0.7470180838784147, + "flos": 533459727360.0, + "grad_norm": 0.07343335643807868, + "language_loss": 0.81235325, + "learning_rate": 0.00015865413868955581, + "loss": 0.82309145, + "num_input_tokens_seen": 322110816, + "router_z_loss_mlp": 0.15319824, + "step": 3883, + "time_per_iteration": 2.651353120803833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064128, 
+ "balance_loss_mlp": 1.04927468, + "epoch": 0.7472104655636783, + "flos": 739338388992.0, + "grad_norm": 0.07434119530275363, + "language_loss": 0.82377172, + "learning_rate": 0.00015842655896964054, + "loss": 0.83441293, + "num_input_tokens_seen": 322192704, + "router_z_loss_mlp": 0.14831543, + "step": 3884, + "time_per_iteration": 3.050145149230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071848, + "balance_loss_mlp": 1.05682731, + "epoch": 0.7474028472489419, + "flos": 640305409536.0, + "grad_norm": 0.06949199138359925, + "language_loss": 0.73357499, + "learning_rate": 0.00015819911186029567, + "loss": 0.74429345, + "num_input_tokens_seen": 322263888, + "router_z_loss_mlp": 0.14990234, + "step": 3885, + "time_per_iteration": 2.8004651069641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073306, + "balance_loss_mlp": 1.05846465, + "epoch": 0.7475952289342055, + "flos": 590249120256.0, + "grad_norm": 0.07642701531837649, + "language_loss": 0.86417222, + "learning_rate": 0.00015797179744982443, + "loss": 0.87490523, + "num_input_tokens_seen": 322331936, + "router_z_loss_mlp": 0.14831543, + "step": 3886, + "time_per_iteration": 2.7258265018463135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071859, + "balance_loss_mlp": 1.05695772, + "epoch": 0.7477876106194691, + "flos": 488191712256.0, + "grad_norm": 0.06842586328042619, + "language_loss": 0.78899908, + "learning_rate": 0.00015774461582647765, + "loss": 0.79971766, + "num_input_tokens_seen": 322402032, + "router_z_loss_mlp": 0.14868164, + "step": 3887, + "time_per_iteration": 2.6812551021575928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067811, + "balance_loss_mlp": 1.0530405, + "epoch": 0.7479799923047326, + "flos": 554733494784.0, + "grad_norm": 0.07125585996553076, + "language_loss": 0.81060201, + "learning_rate": 0.00015751756707845505, + "loss": 0.82128012, + "num_input_tokens_seen": 322472512, + 
"router_z_loss_mlp": 0.14746094, + "step": 3888, + "time_per_iteration": 2.6297996044158936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076062, + "balance_loss_mlp": 1.06092191, + "epoch": 0.7481723739899961, + "flos": 767387105280.0, + "grad_norm": 0.06726063528868273, + "language_loss": 0.87855798, + "learning_rate": 0.00015729065129390502, + "loss": 0.88931859, + "num_input_tokens_seen": 322555104, + "router_z_loss_mlp": 0.15112305, + "step": 3889, + "time_per_iteration": 3.0159242153167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074487, + "balance_loss_mlp": 1.0599674, + "epoch": 0.7483647556752597, + "flos": 496172542464.0, + "grad_norm": 0.10838691697932842, + "language_loss": 0.8232426, + "learning_rate": 0.0001570638685609241, + "loss": 0.83398747, + "num_input_tokens_seen": 322621904, + "router_z_loss_mlp": 0.14501953, + "step": 3890, + "time_per_iteration": 2.6125991344451904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107077, + "balance_loss_mlp": 1.05557132, + "epoch": 0.7485571373605233, + "flos": 472850431488.0, + "grad_norm": 0.08016904552002256, + "language_loss": 0.80514675, + "learning_rate": 0.00015683721896755693, + "loss": 0.81585443, + "num_input_tokens_seen": 322688928, + "router_z_loss_mlp": 0.1517334, + "step": 3891, + "time_per_iteration": 2.5569143295288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013078, + "balance_loss_mlp": 1.00649726, + "epoch": 0.7487495190457868, + "flos": 1554473161728.0, + "grad_norm": 0.007010152707011992, + "language_loss": 0.82210493, + "learning_rate": 0.00015661070260179682, + "loss": 0.83223569, + "num_input_tokens_seen": 322928464, + "router_z_loss_mlp": 0.06591797, + "step": 3892, + "time_per_iteration": 4.943824052810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071888, + "balance_loss_mlp": 1.05717778, + "epoch": 0.7489419007310504, + "flos": 581845773312.0, + "grad_norm": 
0.07161004761849712, + "language_loss": 0.85438764, + "learning_rate": 0.00015638431955158528, + "loss": 0.86510646, + "num_input_tokens_seen": 323002672, + "router_z_loss_mlp": 0.14697266, + "step": 3893, + "time_per_iteration": 2.7677009105682373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072508, + "balance_loss_mlp": 1.057881, + "epoch": 0.749134282416314, + "flos": 567576751104.0, + "grad_norm": 0.06398748480098922, + "language_loss": 0.80855525, + "learning_rate": 0.00015615806990481186, + "loss": 0.81928039, + "num_input_tokens_seen": 323076480, + "router_z_loss_mlp": 0.14611816, + "step": 3894, + "time_per_iteration": 2.7350878715515137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074761, + "balance_loss_mlp": 1.05946612, + "epoch": 0.7493266641015776, + "flos": 533061803520.0, + "grad_norm": 0.06232267001853924, + "language_loss": 0.84572965, + "learning_rate": 0.00015593195374931452, + "loss": 0.85647732, + "num_input_tokens_seen": 323151840, + "router_z_loss_mlp": 0.15270996, + "step": 3895, + "time_per_iteration": 2.7310590744018555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070156, + "balance_loss_mlp": 1.05569601, + "epoch": 0.7495190457868411, + "flos": 523613362176.0, + "grad_norm": 0.10454645503772597, + "language_loss": 0.80334634, + "learning_rate": 0.00015570597117287922, + "loss": 0.81404787, + "num_input_tokens_seen": 323223376, + "router_z_loss_mlp": 0.14453125, + "step": 3896, + "time_per_iteration": 2.6727263927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069845, + "balance_loss_mlp": 1.05496776, + "epoch": 0.7497114274721046, + "flos": 514187315712.0, + "grad_norm": 0.08797347720338106, + "language_loss": 0.77618623, + "learning_rate": 0.0001554801222632406, + "loss": 0.78688467, + "num_input_tokens_seen": 323290288, + "router_z_loss_mlp": 0.14868164, + "step": 3897, + "time_per_iteration": 2.625335931777954 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01073048, + "balance_loss_mlp": 1.05813491, + "epoch": 0.7499038091573682, + "flos": 495006308352.0, + "grad_norm": 0.06959868179345496, + "language_loss": 0.85080492, + "learning_rate": 0.00015525440710808052, + "loss": 0.86153543, + "num_input_tokens_seen": 323359568, + "router_z_loss_mlp": 0.14892578, + "step": 3898, + "time_per_iteration": 2.613952875137329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068734, + "balance_loss_mlp": 1.05366588, + "epoch": 0.7500961908426318, + "flos": 737658233856.0, + "grad_norm": 0.08867238273395864, + "language_loss": 0.77680039, + "learning_rate": 0.00015502882579502953, + "loss": 0.78748775, + "num_input_tokens_seen": 323436688, + "router_z_loss_mlp": 0.1505127, + "step": 3899, + "time_per_iteration": 2.960392951965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068744, + "balance_loss_mlp": 1.05395043, + "epoch": 0.7502885725278954, + "flos": 533400256512.0, + "grad_norm": 0.06630940736811984, + "language_loss": 0.84808308, + "learning_rate": 0.00015480337841166592, + "loss": 0.85877049, + "num_input_tokens_seen": 323510032, + "router_z_loss_mlp": 0.14770508, + "step": 3900, + "time_per_iteration": 2.732886552810669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074258, + "balance_loss_mlp": 1.06013203, + "epoch": 0.7504809542131589, + "flos": 589324792320.0, + "grad_norm": 0.12261327498313113, + "language_loss": 0.82749909, + "learning_rate": 0.00015457806504551647, + "loss": 0.8382417, + "num_input_tokens_seen": 323588896, + "router_z_loss_mlp": 0.14135742, + "step": 3901, + "time_per_iteration": 2.8467769622802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072304, + "balance_loss_mlp": 1.05728328, + "epoch": 0.7506733358984224, + "flos": 511550899200.0, + "grad_norm": 0.06753844274961214, + "language_loss": 0.77791429, + "learning_rate": 0.0001543528857840554, + "loss": 0.78863734, + 
"num_input_tokens_seen": 323661280, + "router_z_loss_mlp": 0.14990234, + "step": 3902, + "time_per_iteration": 2.6523211002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071569, + "balance_loss_mlp": 1.05731118, + "epoch": 0.750865717583686, + "flos": 539268503040.0, + "grad_norm": 0.09602264980762555, + "language_loss": 0.80487525, + "learning_rate": 0.000154127840714705, + "loss": 0.81559098, + "num_input_tokens_seen": 323739200, + "router_z_loss_mlp": 0.14257812, + "step": 3903, + "time_per_iteration": 2.8009955883026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068794, + "balance_loss_mlp": 1.05425048, + "epoch": 0.7510580992689496, + "flos": 476578930176.0, + "grad_norm": 0.08455294978842176, + "language_loss": 0.82418245, + "learning_rate": 0.00015390292992483557, + "loss": 0.8348704, + "num_input_tokens_seen": 323802816, + "router_z_loss_mlp": 0.1451416, + "step": 3904, + "time_per_iteration": 2.5455572605133057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071227, + "balance_loss_mlp": 1.05671942, + "epoch": 0.7512504809542132, + "flos": 579043800576.0, + "grad_norm": 0.08735450092332898, + "language_loss": 0.84165967, + "learning_rate": 0.00015367815350176523, + "loss": 0.85237193, + "num_input_tokens_seen": 323879488, + "router_z_loss_mlp": 0.14501953, + "step": 3905, + "time_per_iteration": 2.7836532592773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067778, + "balance_loss_mlp": 1.05274606, + "epoch": 0.7514428626394767, + "flos": 418660379136.0, + "grad_norm": 0.07770341183537, + "language_loss": 0.82813609, + "learning_rate": 0.00015345351153275987, + "loss": 0.83881384, + "num_input_tokens_seen": 323944512, + "router_z_loss_mlp": 0.15002441, + "step": 3906, + "time_per_iteration": 2.5773417949676514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073295, + "balance_loss_mlp": 1.05825067, + "epoch": 0.7516352443247403, + "flos": 
641039215104.0, + "grad_norm": 0.06258787162872337, + "language_loss": 0.80409896, + "learning_rate": 0.00015322900410503332, + "loss": 0.81483191, + "num_input_tokens_seen": 324020688, + "router_z_loss_mlp": 0.15026855, + "step": 3907, + "time_per_iteration": 2.8312478065490723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068679, + "balance_loss_mlp": 1.05444527, + "epoch": 0.7518276260100039, + "flos": 580998168576.0, + "grad_norm": 0.07094809333059562, + "language_loss": 0.77488625, + "learning_rate": 0.00015300463130574703, + "loss": 0.78557301, + "num_input_tokens_seen": 324098080, + "router_z_loss_mlp": 0.14245605, + "step": 3908, + "time_per_iteration": 4.43429160118103 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073575, + "balance_loss_mlp": 1.05884063, + "epoch": 0.7520200076952674, + "flos": 687342412800.0, + "grad_norm": 0.07651069808134531, + "language_loss": 0.81860089, + "learning_rate": 0.00015278039322201033, + "loss": 0.82933658, + "num_input_tokens_seen": 324183968, + "router_z_loss_mlp": 0.1472168, + "step": 3909, + "time_per_iteration": 2.999046564102173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069479, + "balance_loss_mlp": 1.05531669, + "epoch": 0.7522123893805309, + "flos": 486439976448.0, + "grad_norm": 0.09637101763600625, + "language_loss": 0.79630423, + "learning_rate": 0.00015255628994088004, + "loss": 0.80699903, + "num_input_tokens_seen": 324249568, + "router_z_loss_mlp": 0.14160156, + "step": 3910, + "time_per_iteration": 2.5875840187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075904, + "balance_loss_mlp": 1.06090784, + "epoch": 0.7524047710657945, + "flos": 818982586368.0, + "grad_norm": 0.10609317146357068, + "language_loss": 0.75265759, + "learning_rate": 0.00015233232154936082, + "loss": 0.76341665, + "num_input_tokens_seen": 324345312, + "router_z_loss_mlp": 0.14978027, + "step": 3911, + "time_per_iteration": 3.2563164234161377 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073648, + "balance_loss_mlp": 1.05859196, + "epoch": 0.7525971527510581, + "flos": 699508763136.0, + "grad_norm": 0.11854285995885969, + "language_loss": 0.76211643, + "learning_rate": 0.0001521084881344048, + "loss": 0.7728529, + "num_input_tokens_seen": 324419056, + "router_z_loss_mlp": 0.15039062, + "step": 3912, + "time_per_iteration": 2.867192029953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107173, + "balance_loss_mlp": 1.05700815, + "epoch": 0.7527895344363217, + "flos": 633787421184.0, + "grad_norm": 0.0664126315840823, + "language_loss": 0.86507452, + "learning_rate": 0.00015188478978291208, + "loss": 0.87579179, + "num_input_tokens_seen": 324490848, + "router_z_loss_mlp": 0.14697266, + "step": 3913, + "time_per_iteration": 2.80972957611084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071284, + "balance_loss_mlp": 1.05657363, + "epoch": 0.7529819161215853, + "flos": 562830322176.0, + "grad_norm": 0.08394603234641039, + "language_loss": 0.86425006, + "learning_rate": 0.00015166122658173014, + "loss": 0.87496293, + "num_input_tokens_seen": 324565648, + "router_z_loss_mlp": 0.14697266, + "step": 3914, + "time_per_iteration": 2.8178954124450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106559, + "balance_loss_mlp": 1.05062914, + "epoch": 0.7531742978068487, + "flos": 690665647104.0, + "grad_norm": 0.08202440069993752, + "language_loss": 0.88477957, + "learning_rate": 0.00015143779861765332, + "loss": 0.89543545, + "num_input_tokens_seen": 324642832, + "router_z_loss_mlp": 0.14953613, + "step": 3915, + "time_per_iteration": 2.920933961868286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066546, + "balance_loss_mlp": 1.05214548, + "epoch": 0.7533666794921123, + "flos": 681101208576.0, + "grad_norm": 0.09013491853638725, + "language_loss": 0.81357694, + "learning_rate": 0.00015121450597742458, + "loss": 
0.82424241, + "num_input_tokens_seen": 324718336, + "router_z_loss_mlp": 0.14379883, + "step": 3916, + "time_per_iteration": 2.858567714691162 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069931, + "balance_loss_mlp": 1.05475569, + "epoch": 0.7535590611773759, + "flos": 623669414400.0, + "grad_norm": 0.07911049580666238, + "language_loss": 0.78366303, + "learning_rate": 0.00015099134874773369, + "loss": 0.79436231, + "num_input_tokens_seen": 324787744, + "router_z_loss_mlp": 0.15148926, + "step": 3917, + "time_per_iteration": 2.7468197345733643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064775, + "balance_loss_mlp": 1.04978991, + "epoch": 0.7537514428626395, + "flos": 519427842048.0, + "grad_norm": 0.06092774973766905, + "language_loss": 0.79940081, + "learning_rate": 0.00015076832701521793, + "loss": 0.81004852, + "num_input_tokens_seen": 324863280, + "router_z_loss_mlp": 0.14953613, + "step": 3918, + "time_per_iteration": 2.7390952110290527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067305, + "balance_loss_mlp": 1.05246365, + "epoch": 0.753943824547903, + "flos": 723653512704.0, + "grad_norm": 0.0783583526919487, + "language_loss": 0.81940138, + "learning_rate": 0.000150545440866462, + "loss": 0.83007443, + "num_input_tokens_seen": 324949600, + "router_z_loss_mlp": 0.14831543, + "step": 3919, + "time_per_iteration": 2.999077558517456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074124, + "balance_loss_mlp": 1.05919874, + "epoch": 0.7541362062331666, + "flos": 437547350016.0, + "grad_norm": 0.13926190935015714, + "language_loss": 0.78544766, + "learning_rate": 0.000150322690387998, + "loss": 0.79618883, + "num_input_tokens_seen": 325013808, + "router_z_loss_mlp": 0.14904785, + "step": 3920, + "time_per_iteration": 2.5101473331451416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068636, + "balance_loss_mlp": 1.05396128, + "epoch": 0.7543285879184302, + 
"flos": 565274018304.0, + "grad_norm": 0.07965418146183906, + "language_loss": 0.75046188, + "learning_rate": 0.00015010007566630535, + "loss": 0.76114827, + "num_input_tokens_seen": 325084832, + "router_z_loss_mlp": 0.14648438, + "step": 3921, + "time_per_iteration": 2.741964101791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067461, + "balance_loss_mlp": 1.05235684, + "epoch": 0.7545209696036937, + "flos": 521036416512.0, + "grad_norm": 0.11708553927616697, + "language_loss": 0.81529438, + "learning_rate": 0.00014987759678781077, + "loss": 0.82596898, + "num_input_tokens_seen": 325155120, + "router_z_loss_mlp": 0.15087891, + "step": 3922, + "time_per_iteration": 2.648132562637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066406, + "balance_loss_mlp": 1.05142117, + "epoch": 0.7547133512889573, + "flos": 616066684416.0, + "grad_norm": 0.07358029420700156, + "language_loss": 0.82236576, + "learning_rate": 0.00014965525383888795, + "loss": 0.83302975, + "num_input_tokens_seen": 325235632, + "router_z_loss_mlp": 0.1496582, + "step": 3923, + "time_per_iteration": 2.8054702281951904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064259, + "balance_loss_mlp": 1.04944098, + "epoch": 0.7549057329742208, + "flos": 750845085696.0, + "grad_norm": 0.064367666918871, + "language_loss": 0.72265434, + "learning_rate": 0.00014943304690585851, + "loss": 0.73329699, + "num_input_tokens_seen": 325309696, + "router_z_loss_mlp": 0.14794922, + "step": 3924, + "time_per_iteration": 2.9498682022094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070108, + "balance_loss_mlp": 1.0550642, + "epoch": 0.7550981146594844, + "flos": 514444276224.0, + "grad_norm": 0.09368808464599959, + "language_loss": 0.78766346, + "learning_rate": 0.0001492109760749908, + "loss": 0.79836458, + "num_input_tokens_seen": 325375744, + "router_z_loss_mlp": 0.15026855, + "step": 3925, + "time_per_iteration": 
2.6306443214416504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070609, + "balance_loss_mlp": 1.05580342, + "epoch": 0.755290496344748, + "flos": 522009930240.0, + "grad_norm": 0.06789463297635422, + "language_loss": 0.79637897, + "learning_rate": 0.00014898904143250002, + "loss": 0.8070851, + "num_input_tokens_seen": 325448384, + "router_z_loss_mlp": 0.14770508, + "step": 3926, + "time_per_iteration": 2.675294876098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021023, + "balance_loss_mlp": 1.01463342, + "epoch": 0.7554828780300116, + "flos": 1414615021056.0, + "grad_norm": 0.012183216489225542, + "language_loss": 0.75755203, + "learning_rate": 0.00014876724306454886, + "loss": 0.76776218, + "num_input_tokens_seen": 325678672, + "router_z_loss_mlp": 0.06396484, + "step": 3927, + "time_per_iteration": 4.929020166397095 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077252, + "balance_loss_mlp": 1.06279135, + "epoch": 0.7556752597152752, + "flos": 556937482752.0, + "grad_norm": 0.08141405861107236, + "language_loss": 0.79880834, + "learning_rate": 0.0001485455810572474, + "loss": 0.80958086, + "num_input_tokens_seen": 325746656, + "router_z_loss_mlp": 0.14453125, + "step": 3928, + "time_per_iteration": 2.6965065002441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081004, + "balance_loss_mlp": 1.06655574, + "epoch": 0.7558676414005386, + "flos": 563638279680.0, + "grad_norm": 0.061395348363909676, + "language_loss": 0.84046453, + "learning_rate": 0.00014832405549665236, + "loss": 0.85127461, + "num_input_tokens_seen": 325820304, + "router_z_loss_mlp": 0.14440918, + "step": 3929, + "time_per_iteration": 2.7616498470306396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108063, + "balance_loss_mlp": 1.06583571, + "epoch": 0.7560600230858022, + "flos": 561377392128.0, + "grad_norm": 0.07976690960726483, + "language_loss": 0.7883532, + "learning_rate": 
0.00014810266646876746, + "loss": 0.79915947, + "num_input_tokens_seen": 325895584, + "router_z_loss_mlp": 0.14794922, + "step": 3930, + "time_per_iteration": 2.778254747390747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080018, + "balance_loss_mlp": 1.06545067, + "epoch": 0.7562524047710658, + "flos": 719576649216.0, + "grad_norm": 0.08838808443584828, + "language_loss": 0.7752986, + "learning_rate": 0.00014788141405954364, + "loss": 0.78609884, + "num_input_tokens_seen": 325976752, + "router_z_loss_mlp": 0.14538574, + "step": 3931, + "time_per_iteration": 3.053114891052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078954, + "balance_loss_mlp": 1.06433892, + "epoch": 0.7564447864563294, + "flos": 543347937792.0, + "grad_norm": 0.08282527395338529, + "language_loss": 0.85036647, + "learning_rate": 0.00014766029835487865, + "loss": 0.86115611, + "num_input_tokens_seen": 326047152, + "router_z_loss_mlp": 0.14599609, + "step": 3932, + "time_per_iteration": 2.7713563442230225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088765, + "balance_loss_mlp": 1.07401931, + "epoch": 0.7566371681415929, + "flos": 725805743616.0, + "grad_norm": 0.09534253325991678, + "language_loss": 0.79310846, + "learning_rate": 0.0001474393194406173, + "loss": 0.80399615, + "num_input_tokens_seen": 326119056, + "router_z_loss_mlp": 0.14733887, + "step": 3933, + "time_per_iteration": 2.9569146633148193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081686, + "balance_loss_mlp": 1.06690359, + "epoch": 0.7568295498268565, + "flos": 576580280832.0, + "grad_norm": 0.06403839142600674, + "language_loss": 0.79947335, + "learning_rate": 0.00014721847740255112, + "loss": 0.81029022, + "num_input_tokens_seen": 326196736, + "router_z_loss_mlp": 0.14758301, + "step": 3934, + "time_per_iteration": 2.835782766342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01018298, + "balance_loss_mlp": 
1.01190841, + "epoch": 0.75702193151212, + "flos": 1520059903488.0, + "grad_norm": 0.010361786732504343, + "language_loss": 0.73911923, + "learning_rate": 0.00014699777232641853, + "loss": 0.74930221, + "num_input_tokens_seen": 326404752, + "router_z_loss_mlp": 0.06396484, + "step": 3935, + "time_per_iteration": 4.648789167404175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079127, + "balance_loss_mlp": 1.06448829, + "epoch": 0.7572143131973836, + "flos": 525471556608.0, + "grad_norm": 0.08867148183843263, + "language_loss": 0.78082466, + "learning_rate": 0.00014677720429790526, + "loss": 0.79161596, + "num_input_tokens_seen": 326472832, + "router_z_loss_mlp": 0.1463623, + "step": 3936, + "time_per_iteration": 2.620413064956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073911, + "balance_loss_mlp": 1.05959392, + "epoch": 0.7574066948826472, + "flos": 550738123776.0, + "grad_norm": 0.05592268057464008, + "language_loss": 0.84353757, + "learning_rate": 0.0001465567734026429, + "loss": 0.85427672, + "num_input_tokens_seen": 326546976, + "router_z_loss_mlp": 0.14306641, + "step": 3937, + "time_per_iteration": 2.725707769393921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075578, + "balance_loss_mlp": 1.06070113, + "epoch": 0.7575990765679107, + "flos": 395899176960.0, + "grad_norm": 0.08941460340947252, + "language_loss": 0.82362223, + "learning_rate": 0.00014633647972621034, + "loss": 0.834378, + "num_input_tokens_seen": 326609296, + "router_z_loss_mlp": 0.14868164, + "step": 3938, + "time_per_iteration": 2.4839630126953125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073907, + "balance_loss_mlp": 1.05917263, + "epoch": 0.7577914582531743, + "flos": 585030615552.0, + "grad_norm": 0.06669605761909986, + "language_loss": 0.8624711, + "learning_rate": 0.00014611632335413354, + "loss": 0.87321013, + "num_input_tokens_seen": 326687168, + "router_z_loss_mlp": 0.14709473, + "step": 
3939, + "time_per_iteration": 2.791856527328491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071326, + "balance_loss_mlp": 1.05647278, + "epoch": 0.7579838399384379, + "flos": 820979172864.0, + "grad_norm": 0.06597748597273165, + "language_loss": 0.82603717, + "learning_rate": 0.00014589630437188456, + "loss": 0.83675039, + "num_input_tokens_seen": 326777760, + "router_z_loss_mlp": 0.14831543, + "step": 3940, + "time_per_iteration": 3.1954329013824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081609, + "balance_loss_mlp": 1.06717253, + "epoch": 0.7581762216237015, + "flos": 443892441600.0, + "grad_norm": 0.08139847599649805, + "language_loss": 0.78537852, + "learning_rate": 0.00014567642286488253, + "loss": 0.79619455, + "num_input_tokens_seen": 326843952, + "router_z_loss_mlp": 0.14428711, + "step": 3941, + "time_per_iteration": 2.5560035705566406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074149, + "balance_loss_mlp": 1.05904555, + "epoch": 0.7583686033089649, + "flos": 540886989312.0, + "grad_norm": 0.08568953404097215, + "language_loss": 0.79163635, + "learning_rate": 0.00014545667891849258, + "loss": 0.80237788, + "num_input_tokens_seen": 326911296, + "router_z_loss_mlp": 0.15100098, + "step": 3942, + "time_per_iteration": 2.6567327976226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073928, + "balance_loss_mlp": 1.05890775, + "epoch": 0.7585609849942285, + "flos": 522588091392.0, + "grad_norm": 0.08481557046486428, + "language_loss": 0.82241571, + "learning_rate": 0.00014523707261802733, + "loss": 0.83315504, + "num_input_tokens_seen": 326977776, + "router_z_loss_mlp": 0.14990234, + "step": 3943, + "time_per_iteration": 2.6527955532073975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107533, + "balance_loss_mlp": 1.0604291, + "epoch": 0.7587533666794921, + "flos": 541860503040.0, + "grad_norm": 0.07206762548440185, + "language_loss": 0.81135172, 
+ "learning_rate": 0.00014501760404874527, + "loss": 0.82210505, + "num_input_tokens_seen": 327050240, + "router_z_loss_mlp": 0.14868164, + "step": 3944, + "time_per_iteration": 2.750162124633789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072936, + "balance_loss_mlp": 1.05839252, + "epoch": 0.7589457483647557, + "flos": 606408270336.0, + "grad_norm": 0.07713855070396991, + "language_loss": 0.85369128, + "learning_rate": 0.00014479827329585176, + "loss": 0.86442065, + "num_input_tokens_seen": 327119952, + "router_z_loss_mlp": 0.14538574, + "step": 3945, + "time_per_iteration": 2.755915641784668 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069268, + "balance_loss_mlp": 1.05458164, + "epoch": 0.7591381300500193, + "flos": 555106452480.0, + "grad_norm": 0.06696753824594462, + "language_loss": 0.84734225, + "learning_rate": 0.00014457908044449846, + "loss": 0.85803485, + "num_input_tokens_seen": 327192640, + "router_z_loss_mlp": 0.14685059, + "step": 3946, + "time_per_iteration": 2.794527292251587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066827, + "balance_loss_mlp": 1.05199742, + "epoch": 0.7593305117352828, + "flos": 529681669632.0, + "grad_norm": 0.22019165201908963, + "language_loss": 0.8300361, + "learning_rate": 0.00014436002557978371, + "loss": 0.84070432, + "num_input_tokens_seen": 327271008, + "router_z_loss_mlp": 0.14794922, + "step": 3947, + "time_per_iteration": 2.7884273529052734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011645, + "balance_loss_mlp": 1.00530314, + "epoch": 0.7595228934205464, + "flos": 1502798759424.0, + "grad_norm": 0.007510047355142999, + "language_loss": 0.76643145, + "learning_rate": 0.00014414110878675201, + "loss": 0.77654791, + "num_input_tokens_seen": 327505392, + "router_z_loss_mlp": 0.06347656, + "step": 3948, + "time_per_iteration": 4.91646671295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071151, + 
"balance_loss_mlp": 1.05669069, + "epoch": 0.7597152751058099, + "flos": 455525047296.0, + "grad_norm": 0.06244939704933972, + "language_loss": 0.79716647, + "learning_rate": 0.0001439223301503945, + "loss": 0.80787796, + "num_input_tokens_seen": 327569392, + "router_z_loss_mlp": 0.14440918, + "step": 3949, + "time_per_iteration": 2.5492866039276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071573, + "balance_loss_mlp": 1.05702949, + "epoch": 0.7599076567910735, + "flos": 685466966016.0, + "grad_norm": 0.07710584125137034, + "language_loss": 0.76199448, + "learning_rate": 0.00014370368975564834, + "loss": 0.7727102, + "num_input_tokens_seen": 327648304, + "router_z_loss_mlp": 0.14526367, + "step": 3950, + "time_per_iteration": 2.9614758491516113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107362, + "balance_loss_mlp": 1.05911183, + "epoch": 0.760100038476337, + "flos": 532372414464.0, + "grad_norm": 0.08081333921040441, + "language_loss": 0.83668613, + "learning_rate": 0.00014348518768739766, + "loss": 0.84742236, + "num_input_tokens_seen": 327725600, + "router_z_loss_mlp": 0.14477539, + "step": 3951, + "time_per_iteration": 2.7431232929229736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013923, + "balance_loss_mlp": 1.00758147, + "epoch": 0.7602924201616006, + "flos": 1471742866944.0, + "grad_norm": 0.008820182172653682, + "language_loss": 0.7672804, + "learning_rate": 0.00014326682403047243, + "loss": 0.77741963, + "num_input_tokens_seen": 327954048, + "router_z_loss_mlp": 0.06347656, + "step": 3952, + "time_per_iteration": 4.910484790802002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066227, + "balance_loss_mlp": 1.05181456, + "epoch": 0.7604848018468642, + "flos": 774631558656.0, + "grad_norm": 0.06688586461179376, + "language_loss": 0.86683798, + "learning_rate": 0.00014304859886964867, + "loss": 0.8775003, + "num_input_tokens_seen": 328034656, + "router_z_loss_mlp": 
0.14416504, + "step": 3953, + "time_per_iteration": 3.0533196926116943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106398, + "balance_loss_mlp": 1.04932904, + "epoch": 0.7606771835321278, + "flos": 558185209344.0, + "grad_norm": 0.08095687676459093, + "language_loss": 0.83446729, + "learning_rate": 0.00014283051228964878, + "loss": 0.84510708, + "num_input_tokens_seen": 328107264, + "router_z_loss_mlp": 0.14624023, + "step": 3954, + "time_per_iteration": 2.6953165531158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063032, + "balance_loss_mlp": 1.04855967, + "epoch": 0.7608695652173914, + "flos": 525397404672.0, + "grad_norm": 0.07612254012233202, + "language_loss": 0.82667398, + "learning_rate": 0.00014261256437514197, + "loss": 0.83730423, + "num_input_tokens_seen": 328177168, + "router_z_loss_mlp": 0.14477539, + "step": 3955, + "time_per_iteration": 2.635387897491455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064275, + "balance_loss_mlp": 1.05008912, + "epoch": 0.7610619469026548, + "flos": 615038842368.0, + "grad_norm": 0.07371649985569284, + "language_loss": 0.82440156, + "learning_rate": 0.0001423947552107428, + "loss": 0.83504432, + "num_input_tokens_seen": 328245360, + "router_z_loss_mlp": 0.14196777, + "step": 3956, + "time_per_iteration": 2.737157106399536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066435, + "balance_loss_mlp": 1.05186772, + "epoch": 0.7612543285879184, + "flos": 863356382208.0, + "grad_norm": 0.0738552534680633, + "language_loss": 0.76961863, + "learning_rate": 0.00014217708488101243, + "loss": 0.78028303, + "num_input_tokens_seen": 328326560, + "router_z_loss_mlp": 0.14575195, + "step": 3957, + "time_per_iteration": 3.0698153972625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071692, + "balance_loss_mlp": 1.05686283, + "epoch": 0.761446710273182, + "flos": 553658664960.0, + "grad_norm": 0.08088514343400555, + 
"language_loss": 0.77329475, + "learning_rate": 0.0001419595534704579, + "loss": 0.78401166, + "num_input_tokens_seen": 328395760, + "router_z_loss_mlp": 0.14807129, + "step": 3958, + "time_per_iteration": 2.714460611343384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072652, + "balance_loss_mlp": 1.05804873, + "epoch": 0.7616390919584456, + "flos": 467350373376.0, + "grad_norm": 0.07036888376906092, + "language_loss": 0.81419134, + "learning_rate": 0.00014174216106353237, + "loss": 0.82491785, + "num_input_tokens_seen": 328464560, + "router_z_loss_mlp": 0.14575195, + "step": 3959, + "time_per_iteration": 2.6297383308410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071387, + "balance_loss_mlp": 1.05678439, + "epoch": 0.7618314736437091, + "flos": 498430858752.0, + "grad_norm": 0.08691180014870155, + "language_loss": 0.76360267, + "learning_rate": 0.00014152490774463512, + "loss": 0.77431655, + "num_input_tokens_seen": 328532640, + "router_z_loss_mlp": 0.14599609, + "step": 3960, + "time_per_iteration": 2.650489568710327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068154, + "balance_loss_mlp": 1.05350327, + "epoch": 0.7620238553289727, + "flos": 434545316352.0, + "grad_norm": 0.10467283388045306, + "language_loss": 0.87295103, + "learning_rate": 0.00014130779359811135, + "loss": 0.88363254, + "num_input_tokens_seen": 328595392, + "router_z_loss_mlp": 0.14624023, + "step": 3961, + "time_per_iteration": 2.470933437347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069904, + "balance_loss_mlp": 1.05552769, + "epoch": 0.7622162370142362, + "flos": 664277262336.0, + "grad_norm": 0.06695122847081907, + "language_loss": 0.85981679, + "learning_rate": 0.0001410908187082521, + "loss": 0.87051582, + "num_input_tokens_seen": 328676368, + "router_z_loss_mlp": 0.14379883, + "step": 3962, + "time_per_iteration": 2.8771471977233887 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01068735, + "balance_loss_mlp": 1.05375075, + "epoch": 0.7624086186994998, + "flos": 557965324800.0, + "grad_norm": 0.0731663524296794, + "language_loss": 0.83243585, + "learning_rate": 0.0001408739831592949, + "loss": 0.8431232, + "num_input_tokens_seen": 328745136, + "router_z_loss_mlp": 0.14953613, + "step": 3963, + "time_per_iteration": 2.719027280807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072758, + "balance_loss_mlp": 1.05827415, + "epoch": 0.7626010003847634, + "flos": 629132396544.0, + "grad_norm": 0.09597126350862907, + "language_loss": 0.77261025, + "learning_rate": 0.0001406572870354224, + "loss": 0.78333783, + "num_input_tokens_seen": 328820384, + "router_z_loss_mlp": 0.14477539, + "step": 3964, + "time_per_iteration": 2.8318536281585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_mlp": 1.05142951, + "epoch": 0.7627933820700269, + "flos": 437942702592.0, + "grad_norm": 0.06846487833206179, + "language_loss": 0.86648387, + "learning_rate": 0.00014044073042076337, + "loss": 0.87714344, + "num_input_tokens_seen": 328884976, + "router_z_loss_mlp": 0.14501953, + "step": 3965, + "time_per_iteration": 2.5620529651641846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073227, + "balance_loss_mlp": 1.05873156, + "epoch": 0.7629857637552905, + "flos": 532723350528.0, + "grad_norm": 0.08133731345364971, + "language_loss": 0.89009362, + "learning_rate": 0.00014022431339939302, + "loss": 0.90082592, + "num_input_tokens_seen": 328957792, + "router_z_loss_mlp": 0.14489746, + "step": 3966, + "time_per_iteration": 2.6854476928710938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071043, + "balance_loss_mlp": 1.05605876, + "epoch": 0.7631781454405541, + "flos": 680036290560.0, + "grad_norm": 0.08711941543983692, + "language_loss": 0.78104591, + "learning_rate": 0.00014000803605533163, + "loss": 0.79175639, + "num_input_tokens_seen": 
329034960, + "router_z_loss_mlp": 0.1496582, + "step": 3967, + "time_per_iteration": 2.83705735206604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077697, + "balance_loss_mlp": 1.06324911, + "epoch": 0.7633705271258177, + "flos": 507493859328.0, + "grad_norm": 0.09829351187117948, + "language_loss": 0.83671606, + "learning_rate": 0.00013979189847254553, + "loss": 0.84749299, + "num_input_tokens_seen": 329100848, + "router_z_loss_mlp": 0.14440918, + "step": 3968, + "time_per_iteration": 2.5781285762786865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075641, + "balance_loss_mlp": 1.0607276, + "epoch": 0.7635629088110811, + "flos": 618866085888.0, + "grad_norm": 0.08084752979294811, + "language_loss": 0.80726254, + "learning_rate": 0.00013957590073494674, + "loss": 0.81801891, + "num_input_tokens_seen": 329181120, + "router_z_loss_mlp": 0.14904785, + "step": 3969, + "time_per_iteration": 2.8175971508026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070147, + "balance_loss_mlp": 1.05593681, + "epoch": 0.7637552904963447, + "flos": 638425193472.0, + "grad_norm": 0.08048508029980411, + "language_loss": 0.78762692, + "learning_rate": 0.0001393600429263931, + "loss": 0.7983284, + "num_input_tokens_seen": 329249888, + "router_z_loss_mlp": 0.14208984, + "step": 3970, + "time_per_iteration": 2.7563693523406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01021274, + "balance_loss_mlp": 1.01469386, + "epoch": 0.7639476721816083, + "flos": 1563222302208.0, + "grad_norm": 0.013272156084273934, + "language_loss": 0.74744886, + "learning_rate": 0.00013914432513068792, + "loss": 0.75766158, + "num_input_tokens_seen": 329483824, + "router_z_loss_mlp": 0.06591797, + "step": 3971, + "time_per_iteration": 4.93863320350647 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067066, + "balance_loss_mlp": 1.05265367, + "epoch": 0.7641400538668719, + "flos": 495987162624.0, + "grad_norm": 
0.0632460471594908, + "language_loss": 0.81507617, + "learning_rate": 0.0001389287474315804, + "loss": 0.82574689, + "num_input_tokens_seen": 329553536, + "router_z_loss_mlp": 0.14404297, + "step": 3972, + "time_per_iteration": 2.6393582820892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074461, + "balance_loss_mlp": 1.05995345, + "epoch": 0.7643324355521355, + "flos": 578441046528.0, + "grad_norm": 0.06341816515754745, + "language_loss": 0.80192941, + "learning_rate": 0.00013871330991276505, + "loss": 0.81267405, + "num_input_tokens_seen": 329621856, + "router_z_loss_mlp": 0.1451416, + "step": 3973, + "time_per_iteration": 2.714632987976074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070348, + "balance_loss_mlp": 1.05616236, + "epoch": 0.764524817237399, + "flos": 784823717376.0, + "grad_norm": 0.085490997753428, + "language_loss": 0.80806011, + "learning_rate": 0.00013849801265788247, + "loss": 0.81876361, + "num_input_tokens_seen": 329708192, + "router_z_loss_mlp": 0.14196777, + "step": 3974, + "time_per_iteration": 4.533233880996704 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069636, + "balance_loss_mlp": 1.05451989, + "epoch": 0.7647171989226625, + "flos": 526279514112.0, + "grad_norm": 0.07052252307543246, + "language_loss": 0.8329643, + "learning_rate": 0.00013828285575051818, + "loss": 0.84366071, + "num_input_tokens_seen": 329774704, + "router_z_loss_mlp": 0.15100098, + "step": 3975, + "time_per_iteration": 2.6609082221984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068505, + "balance_loss_mlp": 1.05420017, + "epoch": 0.7649095806079261, + "flos": 554876656128.0, + "grad_norm": 0.07307751300876789, + "language_loss": 0.84132665, + "learning_rate": 0.0001380678392742035, + "loss": 0.85201168, + "num_input_tokens_seen": 329846432, + "router_z_loss_mlp": 0.14306641, + "step": 3976, + "time_per_iteration": 2.804594039916992 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01065652, + "balance_loss_mlp": 1.05060732, + "epoch": 0.7651019622931897, + "flos": 649145954304.0, + "grad_norm": 0.06679898937130221, + "language_loss": 0.84919113, + "learning_rate": 0.00013785296331241526, + "loss": 0.85984766, + "num_input_tokens_seen": 329926336, + "router_z_loss_mlp": 0.15039062, + "step": 3977, + "time_per_iteration": 2.8787760734558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066608, + "balance_loss_mlp": 1.0521363, + "epoch": 0.7652943439784533, + "flos": 1046449248768.0, + "grad_norm": 0.08384386833632657, + "language_loss": 0.87255394, + "learning_rate": 0.00013763822794857583, + "loss": 0.88322002, + "num_input_tokens_seen": 330009536, + "router_z_loss_mlp": 0.14477539, + "step": 3978, + "time_per_iteration": 3.372908115386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063846, + "balance_loss_mlp": 1.04954064, + "epoch": 0.7654867256637168, + "flos": 504350862336.0, + "grad_norm": 0.07264681342413916, + "language_loss": 0.90047956, + "learning_rate": 0.00013742363326605278, + "loss": 0.91111797, + "num_input_tokens_seen": 330083264, + "router_z_loss_mlp": 0.14306641, + "step": 3979, + "time_per_iteration": 2.7111921310424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068001, + "balance_loss_mlp": 1.05338621, + "epoch": 0.7656791073489804, + "flos": 574709976576.0, + "grad_norm": 0.06616632618822393, + "language_loss": 0.78682995, + "learning_rate": 0.00013720917934815935, + "loss": 0.79750991, + "num_input_tokens_seen": 330157120, + "router_z_loss_mlp": 0.14599609, + "step": 3980, + "time_per_iteration": 2.7665858268737793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064513, + "balance_loss_mlp": 1.04985034, + "epoch": 0.765871489034244, + "flos": 492812232192.0, + "grad_norm": 0.0792407009711433, + "language_loss": 0.82811975, + "learning_rate": 0.00013699486627815344, + "loss": 0.83876491, + 
"num_input_tokens_seen": 330224560, + "router_z_loss_mlp": 0.1463623, + "step": 3981, + "time_per_iteration": 2.5893211364746094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067597, + "balance_loss_mlp": 1.05322027, + "epoch": 0.7660638707195075, + "flos": 486024800256.0, + "grad_norm": 0.06352647599324608, + "language_loss": 0.82432151, + "learning_rate": 0.00013678069413923928, + "loss": 0.83499742, + "num_input_tokens_seen": 330292000, + "router_z_loss_mlp": 0.14379883, + "step": 3982, + "time_per_iteration": 2.6204872131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063042, + "balance_loss_mlp": 1.04862928, + "epoch": 0.766256252404771, + "flos": 444295134720.0, + "grad_norm": 0.07852318401052459, + "language_loss": 0.82138562, + "learning_rate": 0.00013656666301456555, + "loss": 0.83201599, + "num_input_tokens_seen": 330357472, + "router_z_loss_mlp": 0.1439209, + "step": 3983, + "time_per_iteration": 2.5520832538604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065895, + "balance_loss_mlp": 1.05096984, + "epoch": 0.7664486340900346, + "flos": 485179766784.0, + "grad_norm": 0.06488313554531835, + "language_loss": 0.84368253, + "learning_rate": 0.0001363527729872267, + "loss": 0.85434151, + "num_input_tokens_seen": 330427792, + "router_z_loss_mlp": 0.14904785, + "step": 3984, + "time_per_iteration": 2.7092504501342773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065315, + "balance_loss_mlp": 1.05033016, + "epoch": 0.7666410157752982, + "flos": 646200820224.0, + "grad_norm": 0.07270873670315516, + "language_loss": 0.76720321, + "learning_rate": 0.00013613902414026207, + "loss": 0.77785635, + "num_input_tokens_seen": 330500320, + "router_z_loss_mlp": 0.1496582, + "step": 3985, + "time_per_iteration": 2.8448526859283447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065572, + "balance_loss_mlp": 1.05049181, + "epoch": 0.7668333974605618, + "flos": 
774303017472.0, + "grad_norm": 0.07569693962897468, + "language_loss": 0.82453251, + "learning_rate": 0.00013592541655665642, + "loss": 0.83518815, + "num_input_tokens_seen": 330581696, + "router_z_loss_mlp": 0.1505127, + "step": 3986, + "time_per_iteration": 3.0181548595428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070428, + "balance_loss_mlp": 1.05580068, + "epoch": 0.7670257791458254, + "flos": 613462574592.0, + "grad_norm": 0.08265865172273029, + "language_loss": 0.85586035, + "learning_rate": 0.00013571195031933947, + "loss": 0.86656457, + "num_input_tokens_seen": 330648000, + "router_z_loss_mlp": 0.14611816, + "step": 3987, + "time_per_iteration": 2.7126588821411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006841, + "balance_loss_mlp": 1.00030851, + "epoch": 0.7672181608310888, + "flos": 1485357378048.0, + "grad_norm": 0.01029491447835557, + "language_loss": 0.80481339, + "learning_rate": 0.00013549862551118626, + "loss": 0.8148818, + "num_input_tokens_seen": 330873872, + "router_z_loss_mlp": 0.06542969, + "step": 3988, + "time_per_iteration": 4.7078423500061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061224, + "balance_loss_mlp": 1.046525, + "epoch": 0.7674105425163524, + "flos": 610732182528.0, + "grad_norm": 0.06747537690646892, + "language_loss": 0.85686624, + "learning_rate": 0.00013528544221501655, + "loss": 0.86747837, + "num_input_tokens_seen": 330945760, + "router_z_loss_mlp": 0.14685059, + "step": 3989, + "time_per_iteration": 2.734370470046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076335, + "balance_loss_mlp": 1.06144583, + "epoch": 0.767602924201616, + "flos": 845205788160.0, + "grad_norm": 0.0637335052103759, + "language_loss": 0.81435496, + "learning_rate": 0.00013507240051359586, + "loss": 0.8251183, + "num_input_tokens_seen": 331025584, + "router_z_loss_mlp": 0.14868164, + "step": 3990, + "time_per_iteration": 3.06548810005188 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071514, + "balance_loss_mlp": 1.05666053, + "epoch": 0.7677953058868796, + "flos": 527114635776.0, + "grad_norm": 0.19838733588160684, + "language_loss": 0.85903186, + "learning_rate": 0.00013485950048963425, + "loss": 0.86974698, + "num_input_tokens_seen": 331093008, + "router_z_loss_mlp": 0.1484375, + "step": 3991, + "time_per_iteration": 2.6094348430633545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106551, + "balance_loss_mlp": 1.05060887, + "epoch": 0.7679876875721431, + "flos": 923550501888.0, + "grad_norm": 0.07043981174766001, + "language_loss": 0.82674527, + "learning_rate": 0.00013464674222578643, + "loss": 0.83740032, + "num_input_tokens_seen": 331177120, + "router_z_loss_mlp": 0.14880371, + "step": 3992, + "time_per_iteration": 3.2195329666137695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069425, + "balance_loss_mlp": 1.0544765, + "epoch": 0.7681800692574067, + "flos": 458087311872.0, + "grad_norm": 0.07032959255599644, + "language_loss": 0.83132064, + "learning_rate": 0.00013443412580465292, + "loss": 0.84201485, + "num_input_tokens_seen": 331245424, + "router_z_loss_mlp": 0.14929199, + "step": 3993, + "time_per_iteration": 2.5895824432373047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065769, + "balance_loss_mlp": 1.05068934, + "epoch": 0.7683724509426703, + "flos": 658436179968.0, + "grad_norm": 0.06321728097990122, + "language_loss": 0.83854759, + "learning_rate": 0.00013422165130877857, + "loss": 0.84920526, + "num_input_tokens_seen": 331327504, + "router_z_loss_mlp": 0.15063477, + "step": 3994, + "time_per_iteration": 2.925792932510376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069232, + "balance_loss_mlp": 1.05415177, + "epoch": 0.7685648326279338, + "flos": 555284491776.0, + "grad_norm": 0.07271740437502876, + "language_loss": 0.80652654, + "learning_rate": 0.00013400931882065327, + "loss": 
0.8172189, + "num_input_tokens_seen": 331398464, + "router_z_loss_mlp": 0.15063477, + "step": 3995, + "time_per_iteration": 2.6709957122802734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065338, + "balance_loss_mlp": 1.0499239, + "epoch": 0.7687572143131974, + "flos": 687404081664.0, + "grad_norm": 0.06876581607663422, + "language_loss": 0.81030929, + "learning_rate": 0.0001337971284227118, + "loss": 0.82096267, + "num_input_tokens_seen": 331484592, + "router_z_loss_mlp": 0.15393066, + "step": 3996, + "time_per_iteration": 3.056353807449341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008904, + "balance_loss_mlp": 1.00222826, + "epoch": 0.7689495959984609, + "flos": 1489453691904.0, + "grad_norm": 0.013387325374254085, + "language_loss": 0.76118422, + "learning_rate": 0.00013358508019733388, + "loss": 0.77127326, + "num_input_tokens_seen": 331721360, + "router_z_loss_mlp": 0.06689453, + "step": 3997, + "time_per_iteration": 4.957718133926392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064729, + "balance_loss_mlp": 1.04914832, + "epoch": 0.7691419776837245, + "flos": 570405888000.0, + "grad_norm": 0.06293795645279736, + "language_loss": 0.80514187, + "learning_rate": 0.0001333731742268438, + "loss": 0.81578922, + "num_input_tokens_seen": 331794240, + "router_z_loss_mlp": 0.15576172, + "step": 3998, + "time_per_iteration": 2.712575674057007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063537, + "balance_loss_mlp": 1.04812336, + "epoch": 0.7693343593689881, + "flos": 520087495680.0, + "grad_norm": 0.06867525176596115, + "language_loss": 0.85581779, + "learning_rate": 0.0001331614105935109, + "loss": 0.86645317, + "num_input_tokens_seen": 331866496, + "router_z_loss_mlp": 0.15393066, + "step": 3999, + "time_per_iteration": 2.7334744930267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061501, + "balance_loss_mlp": 1.04600382, + "epoch": 0.7695267410542517, + 
"flos": 660378438144.0, + "grad_norm": 0.06588382908784379, + "language_loss": 0.84056103, + "learning_rate": 0.00013294978937954883, + "loss": 0.85117608, + "num_input_tokens_seen": 331936592, + "router_z_loss_mlp": 0.15490723, + "step": 4000, + "time_per_iteration": 2.8713667392730713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061328, + "balance_loss_mlp": 1.04591429, + "epoch": 0.7697191227395151, + "flos": 546809564160.0, + "grad_norm": 0.11170111036218774, + "language_loss": 0.85502183, + "learning_rate": 0.00013273831066711655, + "loss": 0.86563516, + "num_input_tokens_seen": 332003536, + "router_z_loss_mlp": 0.15393066, + "step": 4001, + "time_per_iteration": 2.674727201461792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059012, + "balance_loss_mlp": 1.04409897, + "epoch": 0.7699115044247787, + "flos": 540610205184.0, + "grad_norm": 0.06526458774457519, + "language_loss": 0.80125463, + "learning_rate": 0.00013252697453831747, + "loss": 0.81184471, + "num_input_tokens_seen": 332075248, + "router_z_loss_mlp": 0.14880371, + "step": 4002, + "time_per_iteration": 2.7256710529327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063942, + "balance_loss_mlp": 1.04832566, + "epoch": 0.7701038861100423, + "flos": 562936407552.0, + "grad_norm": 0.06842053131152107, + "language_loss": 0.82420772, + "learning_rate": 0.00013231578107519916, + "loss": 0.83484715, + "num_input_tokens_seen": 332158944, + "router_z_loss_mlp": 0.15600586, + "step": 4003, + "time_per_iteration": 2.9035251140594482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106275, + "balance_loss_mlp": 1.04752731, + "epoch": 0.7702962677953059, + "flos": 481737964032.0, + "grad_norm": 0.07973091789387209, + "language_loss": 0.82878852, + "learning_rate": 0.00013210473035975422, + "loss": 0.83941609, + "num_input_tokens_seen": 332226368, + "router_z_loss_mlp": 0.15209961, + "step": 4004, + "time_per_iteration": 
2.628084182739258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010592, + "balance_loss_mlp": 1.04378664, + "epoch": 0.7704886494805695, + "flos": 770389138944.0, + "grad_norm": 0.08630684221581464, + "language_loss": 0.85682714, + "learning_rate": 0.0001318938224739201, + "loss": 0.86741912, + "num_input_tokens_seen": 332314784, + "router_z_loss_mlp": 0.15393066, + "step": 4005, + "time_per_iteration": 3.1021761894226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061315, + "balance_loss_mlp": 1.04588926, + "epoch": 0.770681031165833, + "flos": 601192336896.0, + "grad_norm": 0.06315324541354835, + "language_loss": 0.83698732, + "learning_rate": 0.00013168305749957843, + "loss": 0.84760046, + "num_input_tokens_seen": 332387952, + "router_z_loss_mlp": 0.1541748, + "step": 4006, + "time_per_iteration": 2.8380637168884277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061238, + "balance_loss_mlp": 1.04613376, + "epoch": 0.7708734128510966, + "flos": 496108302336.0, + "grad_norm": 0.07282324785530167, + "language_loss": 0.82726502, + "learning_rate": 0.00013147243551855532, + "loss": 0.83787745, + "num_input_tokens_seen": 332456352, + "router_z_loss_mlp": 0.15075684, + "step": 4007, + "time_per_iteration": 2.6003365516662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064008, + "balance_loss_mlp": 1.04878509, + "epoch": 0.7710657945363601, + "flos": 567299966976.0, + "grad_norm": 0.07719085419162308, + "language_loss": 0.80652189, + "learning_rate": 0.00013126195661262148, + "loss": 0.81716192, + "num_input_tokens_seen": 332534288, + "router_z_loss_mlp": 0.15209961, + "step": 4008, + "time_per_iteration": 2.762053966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066486, + "balance_loss_mlp": 1.05129838, + "epoch": 0.7712581762216237, + "flos": 604550075904.0, + "grad_norm": 0.07418966803723698, + "language_loss": 0.86903155, + "learning_rate": 
0.00013105162086349216, + "loss": 0.87969637, + "num_input_tokens_seen": 332615440, + "router_z_loss_mlp": 0.15161133, + "step": 4009, + "time_per_iteration": 2.8321642875671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060916, + "balance_loss_mlp": 1.04613364, + "epoch": 0.7714505579068872, + "flos": 530894891520.0, + "grad_norm": 0.07303373639120146, + "language_loss": 0.8590073, + "learning_rate": 0.00013084142835282687, + "loss": 0.86961645, + "num_input_tokens_seen": 332687360, + "router_z_loss_mlp": 0.14770508, + "step": 4010, + "time_per_iteration": 2.7242491245269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01005899, + "balance_loss_mlp": 0.9993664, + "epoch": 0.7716429395921508, + "flos": 1422205267968.0, + "grad_norm": 0.003197258642765861, + "language_loss": 0.79884362, + "learning_rate": 0.00013063137916222956, + "loss": 0.80890262, + "num_input_tokens_seen": 332919936, + "router_z_loss_mlp": 0.06542969, + "step": 4011, + "time_per_iteration": 4.762616395950317 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106388, + "balance_loss_mlp": 1.04903793, + "epoch": 0.7718353212774144, + "flos": 578428563456.0, + "grad_norm": 0.08194546236645563, + "language_loss": 0.89672923, + "learning_rate": 0.0001304214733732485, + "loss": 0.90736794, + "num_input_tokens_seen": 332990096, + "router_z_loss_mlp": 0.1484375, + "step": 4012, + "time_per_iteration": 2.7599334716796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065761, + "balance_loss_mlp": 1.05091929, + "epoch": 0.772027702962678, + "flos": 510742941696.0, + "grad_norm": 0.07424002912728798, + "language_loss": 0.82715225, + "learning_rate": 0.00013021171106737672, + "loss": 0.83780992, + "num_input_tokens_seen": 333063616, + "router_z_loss_mlp": 0.14831543, + "step": 4013, + "time_per_iteration": 2.6886706352233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063572, + "balance_loss_mlp": 
1.04939795, + "epoch": 0.7722200846479416, + "flos": 525661705728.0, + "grad_norm": 0.05840576732821659, + "language_loss": 0.79845583, + "learning_rate": 0.00013000209232605071, + "loss": 0.80909157, + "num_input_tokens_seen": 333136368, + "router_z_loss_mlp": 0.1418457, + "step": 4014, + "time_per_iteration": 2.687988519668579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069879, + "balance_loss_mlp": 1.05541873, + "epoch": 0.772412466333205, + "flos": 479598216192.0, + "grad_norm": 0.07708984464094068, + "language_loss": 0.79761243, + "learning_rate": 0.0001297926172306519, + "loss": 0.80831122, + "num_input_tokens_seen": 333207136, + "router_z_loss_mlp": 0.14440918, + "step": 4015, + "time_per_iteration": 2.691276788711548 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071175, + "balance_loss_mlp": 1.05634522, + "epoch": 0.7726048480184686, + "flos": 905688801792.0, + "grad_norm": 0.0617812543483069, + "language_loss": 0.78855252, + "learning_rate": 0.0001295832858625055, + "loss": 0.79926431, + "num_input_tokens_seen": 333291920, + "router_z_loss_mlp": 0.14807129, + "step": 4016, + "time_per_iteration": 3.2806596755981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072517, + "balance_loss_mlp": 1.05814075, + "epoch": 0.7727972297037322, + "flos": 631380801024.0, + "grad_norm": 0.10339069481740779, + "language_loss": 0.69680643, + "learning_rate": 0.00012937409830288154, + "loss": 0.70753151, + "num_input_tokens_seen": 333369824, + "router_z_loss_mlp": 0.14367676, + "step": 4017, + "time_per_iteration": 2.863893508911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075198, + "balance_loss_mlp": 1.0609529, + "epoch": 0.7729896113889958, + "flos": 414786147840.0, + "grad_norm": 0.0799045104942487, + "language_loss": 0.85132849, + "learning_rate": 0.00012916505463299362, + "loss": 0.86208045, + "num_input_tokens_seen": 333434192, + "router_z_loss_mlp": 0.14233398, + "step": 4018, 
+ "time_per_iteration": 2.532130002975464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073187, + "balance_loss_mlp": 1.05844092, + "epoch": 0.7731819930742593, + "flos": 668907694080.0, + "grad_norm": 0.09414519746136404, + "language_loss": 0.77866244, + "learning_rate": 0.00012895615493399972, + "loss": 0.78939426, + "num_input_tokens_seen": 333509696, + "router_z_loss_mlp": 0.14733887, + "step": 4019, + "time_per_iteration": 2.839327812194824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073965, + "balance_loss_mlp": 1.05939734, + "epoch": 0.7733743747595229, + "flos": 489854615040.0, + "grad_norm": 0.14078532910338418, + "language_loss": 0.82467055, + "learning_rate": 0.00012874739928700192, + "loss": 0.83541024, + "num_input_tokens_seen": 333575184, + "router_z_loss_mlp": 0.14562988, + "step": 4020, + "time_per_iteration": 2.596458911895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066633, + "balance_loss_mlp": 1.05195868, + "epoch": 0.7735667564447865, + "flos": 659612325888.0, + "grad_norm": 0.07681934826455636, + "language_loss": 0.79637134, + "learning_rate": 0.00012853878777304624, + "loss": 0.80703765, + "num_input_tokens_seen": 333651568, + "router_z_loss_mlp": 0.14660645, + "step": 4021, + "time_per_iteration": 2.881782054901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080593, + "balance_loss_mlp": 1.0660851, + "epoch": 0.77375913813005, + "flos": 533383004160.0, + "grad_norm": 0.05945562457109584, + "language_loss": 0.84455419, + "learning_rate": 0.000128330320473123, + "loss": 0.85536003, + "num_input_tokens_seen": 333726400, + "router_z_loss_mlp": 0.14489746, + "step": 4022, + "time_per_iteration": 2.7595038414001465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102275, + "balance_loss_mlp": 1.01607394, + "epoch": 0.7739515198153136, + "flos": 1520081925120.0, + "grad_norm": 0.012779532981729017, + "language_loss": 0.783319, + 
"learning_rate": 0.00012812199746816628, + "loss": 0.7935465, + "num_input_tokens_seen": 333960224, + "router_z_loss_mlp": 0.06689453, + "step": 4023, + "time_per_iteration": 4.909268379211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079749, + "balance_loss_mlp": 1.06491959, + "epoch": 0.7741439015005771, + "flos": 640105348608.0, + "grad_norm": 0.0771739695841244, + "language_loss": 0.81660879, + "learning_rate": 0.0001279138188390543, + "loss": 0.82740629, + "num_input_tokens_seen": 334033904, + "router_z_loss_mlp": 0.14807129, + "step": 4024, + "time_per_iteration": 2.8041296005249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072864, + "balance_loss_mlp": 1.05869019, + "epoch": 0.7743362831858407, + "flos": 665841420288.0, + "grad_norm": 0.05641860086988057, + "language_loss": 0.86285681, + "learning_rate": 0.00012770578466660915, + "loss": 0.87358546, + "num_input_tokens_seen": 334107904, + "router_z_loss_mlp": 0.1418457, + "step": 4025, + "time_per_iteration": 2.8959219455718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076822, + "balance_loss_mlp": 1.06208789, + "epoch": 0.7745286648711043, + "flos": 562760939520.0, + "grad_norm": 0.06540295848056549, + "language_loss": 0.8125031, + "learning_rate": 0.0001274978950315968, + "loss": 0.82327133, + "num_input_tokens_seen": 334184048, + "router_z_loss_mlp": 0.14709473, + "step": 4026, + "time_per_iteration": 2.8482625484466553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078061, + "balance_loss_mlp": 1.06332707, + "epoch": 0.7747210465563679, + "flos": 516912565248.0, + "grad_norm": 0.20054905129576697, + "language_loss": 0.83055073, + "learning_rate": 0.00012729015001472716, + "loss": 0.84133136, + "num_input_tokens_seen": 334257152, + "router_z_loss_mlp": 0.14709473, + "step": 4027, + "time_per_iteration": 2.660585641860962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076692, + 
"balance_loss_mlp": 1.06191039, + "epoch": 0.7749134282416313, + "flos": 634209937920.0, + "grad_norm": 0.06859872536525731, + "language_loss": 0.81346893, + "learning_rate": 0.00012708254969665418, + "loss": 0.82423586, + "num_input_tokens_seen": 334331312, + "router_z_loss_mlp": 0.14770508, + "step": 4028, + "time_per_iteration": 2.755984306335449 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087654, + "balance_loss_mlp": 1.07344401, + "epoch": 0.7751058099268949, + "flos": 495364584960.0, + "grad_norm": 0.13856653823900703, + "language_loss": 0.83200014, + "learning_rate": 0.00012687509415797526, + "loss": 0.84287679, + "num_input_tokens_seen": 334397344, + "router_z_loss_mlp": 0.14208984, + "step": 4029, + "time_per_iteration": 2.605494976043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081118, + "balance_loss_mlp": 1.0666815, + "epoch": 0.7752981916121585, + "flos": 510310513152.0, + "grad_norm": 0.07842880902840609, + "language_loss": 0.81172287, + "learning_rate": 0.00012666778347923208, + "loss": 0.82253402, + "num_input_tokens_seen": 334467872, + "router_z_loss_mlp": 0.14428711, + "step": 4030, + "time_per_iteration": 2.7449951171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082077, + "balance_loss_mlp": 1.06805801, + "epoch": 0.7754905732974221, + "flos": 497548749312.0, + "grad_norm": 0.06532931928318482, + "language_loss": 0.83712244, + "learning_rate": 0.0001264606177409092, + "loss": 0.84794319, + "num_input_tokens_seen": 334539088, + "router_z_loss_mlp": 0.14025879, + "step": 4031, + "time_per_iteration": 2.654155731201172 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078609, + "balance_loss_mlp": 1.06400609, + "epoch": 0.7756829549826857, + "flos": 480744626688.0, + "grad_norm": 0.0713548804544701, + "language_loss": 0.85789335, + "learning_rate": 0.00012625359702343609, + "loss": 0.8686794, + "num_input_tokens_seen": 334612576, + "router_z_loss_mlp": 
0.14587402, + "step": 4032, + "time_per_iteration": 2.7373740673065186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082548, + "balance_loss_mlp": 1.06805241, + "epoch": 0.7758753366679492, + "flos": 552630822912.0, + "grad_norm": 0.0791790150360774, + "language_loss": 0.85047174, + "learning_rate": 0.00012604672140718504, + "loss": 0.86129719, + "num_input_tokens_seen": 334677824, + "router_z_loss_mlp": 0.14477539, + "step": 4033, + "time_per_iteration": 2.668175458908081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080956, + "balance_loss_mlp": 1.06637716, + "epoch": 0.7760677183532128, + "flos": 703835246592.0, + "grad_norm": 0.07723618035989119, + "language_loss": 0.77780712, + "learning_rate": 0.00012583999097247233, + "loss": 0.78861672, + "num_input_tokens_seen": 334751456, + "router_z_loss_mlp": 0.14562988, + "step": 4034, + "time_per_iteration": 2.882200241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082263, + "balance_loss_mlp": 1.06798124, + "epoch": 0.7762601000384763, + "flos": 523470200832.0, + "grad_norm": 0.07383461376071596, + "language_loss": 0.79689777, + "learning_rate": 0.0001256334057995578, + "loss": 0.80772036, + "num_input_tokens_seen": 334823008, + "router_z_loss_mlp": 0.1427002, + "step": 4035, + "time_per_iteration": 2.690647602081299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080186, + "balance_loss_mlp": 1.06548703, + "epoch": 0.7764524817237399, + "flos": 557532896256.0, + "grad_norm": 0.06987700123133081, + "language_loss": 0.84825015, + "learning_rate": 0.000125426965968645, + "loss": 0.85905206, + "num_input_tokens_seen": 334896048, + "router_z_loss_mlp": 0.14672852, + "step": 4036, + "time_per_iteration": 2.7032387256622314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077648, + "balance_loss_mlp": 1.06333125, + "epoch": 0.7766448634090035, + "flos": 579725849088.0, + "grad_norm": 0.07584247784389492, + 
"language_loss": 0.82193661, + "learning_rate": 0.00012522067155988092, + "loss": 0.83271313, + "num_input_tokens_seen": 334964416, + "router_z_loss_mlp": 0.14306641, + "step": 4037, + "time_per_iteration": 2.6950039863586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107402, + "balance_loss_mlp": 1.05985785, + "epoch": 0.776837245094267, + "flos": 635603397120.0, + "grad_norm": 0.09463172891349511, + "language_loss": 0.75239801, + "learning_rate": 0.00012501452265335617, + "loss": 0.76313818, + "num_input_tokens_seen": 335043360, + "router_z_loss_mlp": 0.14160156, + "step": 4038, + "time_per_iteration": 2.8472111225128174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107878, + "balance_loss_mlp": 1.06440353, + "epoch": 0.7770296267795306, + "flos": 614680565760.0, + "grad_norm": 0.06689469876162565, + "language_loss": 0.82871956, + "learning_rate": 0.0001248085193291047, + "loss": 0.83950734, + "num_input_tokens_seen": 335113216, + "router_z_loss_mlp": 0.14367676, + "step": 4039, + "time_per_iteration": 2.750570774078369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078863, + "balance_loss_mlp": 1.06394982, + "epoch": 0.7772220084647942, + "flos": 878808890880.0, + "grad_norm": 0.07053894567576345, + "language_loss": 0.82192504, + "learning_rate": 0.00012460266166710443, + "loss": 0.83271372, + "num_input_tokens_seen": 335195824, + "router_z_loss_mlp": 0.14904785, + "step": 4040, + "time_per_iteration": 3.2112436294555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072892, + "balance_loss_mlp": 1.05816936, + "epoch": 0.7774143901500578, + "flos": 839641489920.0, + "grad_norm": 0.07497892804432345, + "language_loss": 0.77567667, + "learning_rate": 0.00012439694974727633, + "loss": 0.78640562, + "num_input_tokens_seen": 335269712, + "router_z_loss_mlp": 0.14709473, + "step": 4041, + "time_per_iteration": 3.0245847702026367 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01073829, + "balance_loss_mlp": 1.05928516, + "epoch": 0.7776067718353212, + "flos": 568147571712.0, + "grad_norm": 0.060778076855285974, + "language_loss": 0.79776394, + "learning_rate": 0.00012419138364948458, + "loss": 0.8085022, + "num_input_tokens_seen": 335343408, + "router_z_loss_mlp": 0.14538574, + "step": 4042, + "time_per_iteration": 2.7336411476135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066625, + "balance_loss_mlp": 1.05183077, + "epoch": 0.7777991535205848, + "flos": 745943012352.0, + "grad_norm": 0.06729651648033357, + "language_loss": 0.82573462, + "learning_rate": 0.00012398596345353702, + "loss": 0.83640087, + "num_input_tokens_seen": 335415360, + "router_z_loss_mlp": 0.14770508, + "step": 4043, + "time_per_iteration": 2.888540029525757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072811, + "balance_loss_mlp": 1.05832708, + "epoch": 0.7779915352058484, + "flos": 538075104768.0, + "grad_norm": 0.06360284463986167, + "language_loss": 0.83389121, + "learning_rate": 0.0001237806892391851, + "loss": 0.84461933, + "num_input_tokens_seen": 335491568, + "router_z_loss_mlp": 0.14489746, + "step": 4044, + "time_per_iteration": 2.745943546295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072989, + "balance_loss_mlp": 1.05827904, + "epoch": 0.778183916891112, + "flos": 634788099072.0, + "grad_norm": 0.07557014389374586, + "language_loss": 0.80569065, + "learning_rate": 0.0001235755610861233, + "loss": 0.81642056, + "num_input_tokens_seen": 335567200, + "router_z_loss_mlp": 0.14685059, + "step": 4045, + "time_per_iteration": 2.8391878604888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073322, + "balance_loss_mlp": 1.05836082, + "epoch": 0.7783762985763756, + "flos": 588677621760.0, + "grad_norm": 0.08271633587976211, + "language_loss": 0.84886134, + "learning_rate": 0.0001233705790739893, + "loss": 0.85959458, + "num_input_tokens_seen": 
335640512, + "router_z_loss_mlp": 0.14941406, + "step": 4046, + "time_per_iteration": 2.7301955223083496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070585, + "balance_loss_mlp": 1.05619669, + "epoch": 0.7785686802616391, + "flos": 930656563200.0, + "grad_norm": 0.07709409005439366, + "language_loss": 0.75105876, + "learning_rate": 0.0001231657432823643, + "loss": 0.76176465, + "num_input_tokens_seen": 335726016, + "router_z_loss_mlp": 0.14379883, + "step": 4047, + "time_per_iteration": 3.2447426319122314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072293, + "balance_loss_mlp": 1.05745101, + "epoch": 0.7787610619469026, + "flos": 497934190080.0, + "grad_norm": 0.08319783109308485, + "language_loss": 0.78652561, + "learning_rate": 0.0001229610537907725, + "loss": 0.79724848, + "num_input_tokens_seen": 335794864, + "router_z_loss_mlp": 0.14819336, + "step": 4048, + "time_per_iteration": 2.6655571460723877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071474, + "balance_loss_mlp": 1.05683541, + "epoch": 0.7789534436321662, + "flos": 515637674496.0, + "grad_norm": 0.1398744785443317, + "language_loss": 0.90141088, + "learning_rate": 0.00012275651067868143, + "loss": 0.91212559, + "num_input_tokens_seen": 335860928, + "router_z_loss_mlp": 0.14624023, + "step": 4049, + "time_per_iteration": 2.660238265991211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072645, + "balance_loss_mlp": 1.05813682, + "epoch": 0.7791458253174298, + "flos": 988476369408.0, + "grad_norm": 0.057540222114583386, + "language_loss": 0.8025769, + "learning_rate": 0.00012255211402550182, + "loss": 0.81330329, + "num_input_tokens_seen": 335945728, + "router_z_loss_mlp": 0.14477539, + "step": 4050, + "time_per_iteration": 3.233478546142578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073902, + "balance_loss_mlp": 1.05907226, + "epoch": 0.7793382070026933, + "flos": 629040992256.0, + 
"grad_norm": 0.08623870329629198, + "language_loss": 0.76389378, + "learning_rate": 0.00012234786391058727, + "loss": 0.77463281, + "num_input_tokens_seen": 336014848, + "router_z_loss_mlp": 0.14819336, + "step": 4051, + "time_per_iteration": 2.771480083465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078975, + "balance_loss_mlp": 1.06439614, + "epoch": 0.7795305886879569, + "flos": 531752408064.0, + "grad_norm": 0.08444624617327998, + "language_loss": 0.84906709, + "learning_rate": 0.0001221437604132352, + "loss": 0.85985684, + "num_input_tokens_seen": 336080096, + "router_z_loss_mlp": 0.14575195, + "step": 4052, + "time_per_iteration": 2.6185004711151123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074724, + "balance_loss_mlp": 1.05997825, + "epoch": 0.7797229703732205, + "flos": 611979909120.0, + "grad_norm": 0.08471537445823431, + "language_loss": 0.8108837, + "learning_rate": 0.0001219398036126852, + "loss": 0.82163101, + "num_input_tokens_seen": 336154640, + "router_z_loss_mlp": 0.1472168, + "step": 4053, + "time_per_iteration": 4.269315004348755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078474, + "balance_loss_mlp": 1.06376374, + "epoch": 0.7799153520584841, + "flos": 872164620288.0, + "grad_norm": 0.0665397662082905, + "language_loss": 0.78063762, + "learning_rate": 0.00012173599358812027, + "loss": 0.79142237, + "num_input_tokens_seen": 336244160, + "router_z_loss_mlp": 0.14685059, + "step": 4054, + "time_per_iteration": 3.3110597133636475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082538, + "balance_loss_mlp": 1.06795871, + "epoch": 0.7801077337437476, + "flos": 583627244544.0, + "grad_norm": 0.07210936675879133, + "language_loss": 0.8279568, + "learning_rate": 0.0001215323304186668, + "loss": 0.83878219, + "num_input_tokens_seen": 336317936, + "router_z_loss_mlp": 0.14575195, + "step": 4055, + "time_per_iteration": 2.8330674171447754 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086694, + "balance_loss_mlp": 1.07248473, + "epoch": 0.7803001154290111, + "flos": 601165172736.0, + "grad_norm": 0.07177144261981991, + "language_loss": 0.87391448, + "learning_rate": 0.00012132881418339364, + "loss": 0.88478148, + "num_input_tokens_seen": 336389504, + "router_z_loss_mlp": 0.14196777, + "step": 4056, + "time_per_iteration": 2.776947259902954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_mlp": 1.02952027, + "epoch": 0.7804924971142747, + "flos": 1479577591296.0, + "grad_norm": 0.02528916030641435, + "language_loss": 0.77517563, + "learning_rate": 0.00012112544496131306, + "loss": 0.78553808, + "num_input_tokens_seen": 336615536, + "router_z_loss_mlp": 0.06738281, + "step": 4057, + "time_per_iteration": 4.90228271484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083986, + "balance_loss_mlp": 1.06952608, + "epoch": 0.7806848787995383, + "flos": 630362870784.0, + "grad_norm": 0.06952403466648098, + "language_loss": 0.76993859, + "learning_rate": 0.00012092222283137944, + "loss": 0.78077847, + "num_input_tokens_seen": 336686400, + "router_z_loss_mlp": 0.14453125, + "step": 4058, + "time_per_iteration": 2.8027281761169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_mlp": 1.02595437, + "epoch": 0.7808772604848019, + "flos": 1417587319296.0, + "grad_norm": 0.023618595086734803, + "language_loss": 0.7890631, + "learning_rate": 0.00012071914787249111, + "loss": 0.79938942, + "num_input_tokens_seen": 336912704, + "router_z_loss_mlp": 0.06689453, + "step": 4059, + "time_per_iteration": 4.777599811553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086286, + "balance_loss_mlp": 1.07214832, + "epoch": 0.7810696421700654, + "flos": 731696011776.0, + "grad_norm": 0.06473679884808177, + "language_loss": 0.83483815, + "learning_rate": 0.00012051622016348856, + "loss": 
0.84570104, + "num_input_tokens_seen": 336997040, + "router_z_loss_mlp": 0.14135742, + "step": 4060, + "time_per_iteration": 3.0805013179779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082313, + "balance_loss_mlp": 1.0681268, + "epoch": 0.781262023855329, + "flos": 424941230592.0, + "grad_norm": 0.07665957086955441, + "language_loss": 0.84603846, + "learning_rate": 0.00012031343978315539, + "loss": 0.85686159, + "num_input_tokens_seen": 337059760, + "router_z_loss_mlp": 0.14208984, + "step": 4061, + "time_per_iteration": 2.509371280670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079981, + "balance_loss_mlp": 1.06562829, + "epoch": 0.7814544055405925, + "flos": 501027628032.0, + "grad_norm": 0.11716755196941751, + "language_loss": 0.82515299, + "learning_rate": 0.00012011080681021774, + "loss": 0.83595276, + "num_input_tokens_seen": 337128528, + "router_z_loss_mlp": 0.14355469, + "step": 4062, + "time_per_iteration": 2.653513193130493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084404, + "balance_loss_mlp": 1.0701232, + "epoch": 0.7816467872258561, + "flos": 462448300032.0, + "grad_norm": 0.06950633997018366, + "language_loss": 0.86157346, + "learning_rate": 0.00011990832132334512, + "loss": 0.87241757, + "num_input_tokens_seen": 337194112, + "router_z_loss_mlp": 0.14282227, + "step": 4063, + "time_per_iteration": 2.5633385181427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107598, + "balance_loss_mlp": 1.06134164, + "epoch": 0.7818391689111197, + "flos": 740818483200.0, + "grad_norm": 0.08193675337903113, + "language_loss": 0.82464862, + "learning_rate": 0.00011970598340114897, + "loss": 0.83540839, + "num_input_tokens_seen": 337270416, + "router_z_loss_mlp": 0.14624023, + "step": 4064, + "time_per_iteration": 2.978691339492798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075693, + "balance_loss_mlp": 1.06104183, + "epoch": 0.7820315505963832, + 
"flos": 547669278720.0, + "grad_norm": 0.07485411860694487, + "language_loss": 0.84175539, + "learning_rate": 0.00011950379312218396, + "loss": 0.85251236, + "num_input_tokens_seen": 337343024, + "router_z_loss_mlp": 0.1463623, + "step": 4065, + "time_per_iteration": 2.743990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077394, + "balance_loss_mlp": 1.06274307, + "epoch": 0.7822239322816468, + "flos": 728983245312.0, + "grad_norm": 0.06405873824193194, + "language_loss": 0.86273229, + "learning_rate": 0.00011930175056494719, + "loss": 0.87350619, + "num_input_tokens_seen": 337417232, + "router_z_loss_mlp": 0.1463623, + "step": 4066, + "time_per_iteration": 2.880624532699585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077235, + "balance_loss_mlp": 1.06242979, + "epoch": 0.7824163139669104, + "flos": 452016433152.0, + "grad_norm": 0.05775885887204321, + "language_loss": 0.75816822, + "learning_rate": 0.00011909985580787885, + "loss": 0.76894057, + "num_input_tokens_seen": 337488224, + "router_z_loss_mlp": 0.14794922, + "step": 4067, + "time_per_iteration": 2.6789603233337402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071164, + "balance_loss_mlp": 1.05672777, + "epoch": 0.782608695652174, + "flos": 540489065472.0, + "grad_norm": 0.06284042088337013, + "language_loss": 0.81289232, + "learning_rate": 0.00011889810892936137, + "loss": 0.82360399, + "num_input_tokens_seen": 337564928, + "router_z_loss_mlp": 0.14428711, + "step": 4068, + "time_per_iteration": 2.7376155853271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107855, + "balance_loss_mlp": 1.0636723, + "epoch": 0.7828010773374374, + "flos": 500308503552.0, + "grad_norm": 0.07211764568585548, + "language_loss": 0.77206087, + "learning_rate": 0.00011869651000771959, + "loss": 0.78284639, + "num_input_tokens_seen": 337641632, + "router_z_loss_mlp": 0.1484375, + "step": 4069, + "time_per_iteration": 2.85400390625 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076002, + "balance_loss_mlp": 1.06167328, + "epoch": 0.782993459022701, + "flos": 600816807936.0, + "grad_norm": 0.06878922071462945, + "language_loss": 0.82603711, + "learning_rate": 0.00011849505912122117, + "loss": 0.83679712, + "num_input_tokens_seen": 337711968, + "router_z_loss_mlp": 0.14318848, + "step": 4070, + "time_per_iteration": 2.7624692916870117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073656, + "balance_loss_mlp": 1.05892146, + "epoch": 0.7831858407079646, + "flos": 810055779840.0, + "grad_norm": 0.07956596885242023, + "language_loss": 0.77556145, + "learning_rate": 0.00011829375634807654, + "loss": 0.78629792, + "num_input_tokens_seen": 337795792, + "router_z_loss_mlp": 0.1472168, + "step": 4071, + "time_per_iteration": 3.049309015274048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107043, + "balance_loss_mlp": 1.05580282, + "epoch": 0.7833782223932282, + "flos": 806594153472.0, + "grad_norm": 0.06202372733379216, + "language_loss": 0.81076932, + "learning_rate": 0.00011809260176643821, + "loss": 0.8214736, + "num_input_tokens_seen": 337875584, + "router_z_loss_mlp": 0.14599609, + "step": 4072, + "time_per_iteration": 3.1130549907684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078344, + "balance_loss_mlp": 1.06361008, + "epoch": 0.7835706040784918, + "flos": 520870860288.0, + "grad_norm": 0.09346858465920099, + "language_loss": 0.8374989, + "learning_rate": 0.00011789159545440131, + "loss": 0.84828234, + "num_input_tokens_seen": 337942304, + "router_z_loss_mlp": 0.14709473, + "step": 4073, + "time_per_iteration": 2.602320909500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078791, + "balance_loss_mlp": 1.06415224, + "epoch": 0.7837629857637552, + "flos": 505605929472.0, + "grad_norm": 0.05972390006809772, + "language_loss": 0.82226318, + "learning_rate": 0.00011769073749000348, + "loss": 
0.83305109, + "num_input_tokens_seen": 338020864, + "router_z_loss_mlp": 0.14624023, + "step": 4074, + "time_per_iteration": 2.808209180831909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073655, + "balance_loss_mlp": 1.05863476, + "epoch": 0.7839553674490188, + "flos": 516124431360.0, + "grad_norm": 0.07654822169545344, + "language_loss": 0.76011252, + "learning_rate": 0.0001174900279512246, + "loss": 0.77084911, + "num_input_tokens_seen": 338089584, + "router_z_loss_mlp": 0.14990234, + "step": 4075, + "time_per_iteration": 2.6128828525543213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071701, + "balance_loss_mlp": 1.05739617, + "epoch": 0.7841477491342824, + "flos": 506648825856.0, + "grad_norm": 0.06466128589052426, + "language_loss": 0.81886286, + "learning_rate": 0.00011728946691598707, + "loss": 0.82957983, + "num_input_tokens_seen": 338159568, + "router_z_loss_mlp": 0.14318848, + "step": 4076, + "time_per_iteration": 2.660953998565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078769, + "balance_loss_mlp": 1.06449986, + "epoch": 0.784340130819546, + "flos": 719636120064.0, + "grad_norm": 0.09310549739723947, + "language_loss": 0.76184124, + "learning_rate": 0.00011708905446215561, + "loss": 0.77262896, + "num_input_tokens_seen": 338233952, + "router_z_loss_mlp": 0.1427002, + "step": 4077, + "time_per_iteration": 2.8871099948883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072579, + "balance_loss_mlp": 1.05803514, + "epoch": 0.7845325125048095, + "flos": 514441704960.0, + "grad_norm": 0.06079440348855826, + "language_loss": 0.80103385, + "learning_rate": 0.00011688879066753711, + "loss": 0.81175959, + "num_input_tokens_seen": 338309568, + "router_z_loss_mlp": 0.14526367, + "step": 4078, + "time_per_iteration": 2.7004237174987793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075184, + "balance_loss_mlp": 1.06087887, + "epoch": 0.7847248941900731, 
+ "flos": 466102646784.0, + "grad_norm": 0.08023192090613442, + "language_loss": 0.87211287, + "learning_rate": 0.00011668867560988122, + "loss": 0.88286471, + "num_input_tokens_seen": 338375920, + "router_z_loss_mlp": 0.14294434, + "step": 4079, + "time_per_iteration": 2.6138765811920166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079594, + "balance_loss_mlp": 1.06459749, + "epoch": 0.7849172758753367, + "flos": 503028983808.0, + "grad_norm": 0.07587541015250795, + "language_loss": 0.84325242, + "learning_rate": 0.00011648870936687916, + "loss": 0.85404837, + "num_input_tokens_seen": 338452208, + "router_z_loss_mlp": 0.14978027, + "step": 4080, + "time_per_iteration": 2.829251766204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074534, + "balance_loss_mlp": 1.05989528, + "epoch": 0.7851096575606002, + "flos": 531999456768.0, + "grad_norm": 0.11404502533109409, + "language_loss": 0.78844041, + "learning_rate": 0.00011628889201616461, + "loss": 0.79918575, + "num_input_tokens_seen": 338522864, + "router_z_loss_mlp": 0.1463623, + "step": 4081, + "time_per_iteration": 2.6469521522521973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073576, + "balance_loss_mlp": 1.05934227, + "epoch": 0.7853020392458638, + "flos": 569956207104.0, + "grad_norm": 0.07608494158988048, + "language_loss": 0.82050377, + "learning_rate": 0.00011608922363531393, + "loss": 0.83123952, + "num_input_tokens_seen": 338591024, + "router_z_loss_mlp": 0.14245605, + "step": 4082, + "time_per_iteration": 2.692795753479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075189, + "balance_loss_mlp": 1.06111002, + "epoch": 0.7854944209311273, + "flos": 832579845120.0, + "grad_norm": 0.08462347153699132, + "language_loss": 0.83413076, + "learning_rate": 0.00011588970430184504, + "loss": 0.84488267, + "num_input_tokens_seen": 338669616, + "router_z_loss_mlp": 0.14086914, + "step": 4083, + "time_per_iteration": 
3.1208105087280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072687, + "balance_loss_mlp": 1.05856121, + "epoch": 0.7856868026163909, + "flos": 559929604608.0, + "grad_norm": 0.07095149348984836, + "language_loss": 0.81742346, + "learning_rate": 0.00011569033409321822, + "loss": 0.82815039, + "num_input_tokens_seen": 338740416, + "router_z_loss_mlp": 0.14135742, + "step": 4084, + "time_per_iteration": 2.7347347736358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068027, + "balance_loss_mlp": 1.05343556, + "epoch": 0.7858791843016545, + "flos": 545230725120.0, + "grad_norm": 0.07957990529540243, + "language_loss": 0.73091239, + "learning_rate": 0.00011549111308683591, + "loss": 0.74159265, + "num_input_tokens_seen": 338807664, + "router_z_loss_mlp": 0.14587402, + "step": 4085, + "time_per_iteration": 2.7169110774993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071287, + "balance_loss_mlp": 1.05705309, + "epoch": 0.7860715659869181, + "flos": 380997665280.0, + "grad_norm": 0.09071290102640983, + "language_loss": 0.80941343, + "learning_rate": 0.00011529204136004251, + "loss": 0.8201263, + "num_input_tokens_seen": 338869472, + "router_z_loss_mlp": 0.14233398, + "step": 4086, + "time_per_iteration": 2.452552318572998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066539, + "balance_loss_mlp": 1.05191231, + "epoch": 0.7862639476721817, + "flos": 567440930304.0, + "grad_norm": 0.05875076882668594, + "language_loss": 0.84497392, + "learning_rate": 0.00011509311899012459, + "loss": 0.85563934, + "num_input_tokens_seen": 338941312, + "router_z_loss_mlp": 0.14624023, + "step": 4087, + "time_per_iteration": 2.717156410217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072502, + "balance_loss_mlp": 1.05781543, + "epoch": 0.7864563293574451, + "flos": 545238065664.0, + "grad_norm": 0.09582325425007773, + "language_loss": 0.78019136, + "learning_rate": 
0.00011489434605431053, + "loss": 0.79091644, + "num_input_tokens_seen": 339010208, + "router_z_loss_mlp": 0.14672852, + "step": 4088, + "time_per_iteration": 2.6889476776123047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065898, + "balance_loss_mlp": 1.05135465, + "epoch": 0.7866487110427087, + "flos": 563536963584.0, + "grad_norm": 0.07016527238188626, + "language_loss": 0.81085324, + "learning_rate": 0.0001146957226297708, + "loss": 0.82151222, + "num_input_tokens_seen": 339081232, + "router_z_loss_mlp": 0.14526367, + "step": 4089, + "time_per_iteration": 2.727022647857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070017, + "balance_loss_mlp": 1.05554497, + "epoch": 0.7868410927279723, + "flos": 728189968896.0, + "grad_norm": 0.08533113133407452, + "language_loss": 0.76128238, + "learning_rate": 0.00011449724879361827, + "loss": 0.77198255, + "num_input_tokens_seen": 339161040, + "router_z_loss_mlp": 0.14453125, + "step": 4090, + "time_per_iteration": 3.0626373291015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073737, + "balance_loss_mlp": 1.05938458, + "epoch": 0.7870334744132359, + "flos": 521355045888.0, + "grad_norm": 0.09697336218432462, + "language_loss": 0.7367081, + "learning_rate": 0.00011429892462290687, + "loss": 0.74744546, + "num_input_tokens_seen": 339233984, + "router_z_loss_mlp": 0.14343262, + "step": 4091, + "time_per_iteration": 2.688397169113159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066999, + "balance_loss_mlp": 1.05259871, + "epoch": 0.7872258560984994, + "flos": 451411107840.0, + "grad_norm": 0.06809709972371855, + "language_loss": 0.83140373, + "learning_rate": 0.00011410075019463295, + "loss": 0.84207374, + "num_input_tokens_seen": 339303168, + "router_z_loss_mlp": 0.1439209, + "step": 4092, + "time_per_iteration": 2.667365789413452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064434, + "balance_loss_mlp": 
1.04979479, + "epoch": 0.787418237783763, + "flos": 515195334144.0, + "grad_norm": 0.0662823120947489, + "language_loss": 0.79980755, + "learning_rate": 0.00011390272558573461, + "loss": 0.81045187, + "num_input_tokens_seen": 339374512, + "router_z_loss_mlp": 0.1463623, + "step": 4093, + "time_per_iteration": 2.7487874031066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063058, + "balance_loss_mlp": 1.04871678, + "epoch": 0.7876106194690266, + "flos": 485081021952.0, + "grad_norm": 0.07241506189294278, + "language_loss": 0.80018187, + "learning_rate": 0.00011370485087309202, + "loss": 0.81081247, + "num_input_tokens_seen": 339442720, + "router_z_loss_mlp": 0.14343262, + "step": 4094, + "time_per_iteration": 2.6645123958587646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063246, + "balance_loss_mlp": 1.04858303, + "epoch": 0.7878030011542901, + "flos": 542841357312.0, + "grad_norm": 0.07706414391638888, + "language_loss": 0.79288125, + "learning_rate": 0.00011350712613352688, + "loss": 0.80351365, + "num_input_tokens_seen": 339508800, + "router_z_loss_mlp": 0.1463623, + "step": 4095, + "time_per_iteration": 2.700049877166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060614, + "balance_loss_mlp": 1.0458796, + "epoch": 0.7879953828395537, + "flos": 516739668480.0, + "grad_norm": 0.0878043495750585, + "language_loss": 0.79471409, + "learning_rate": 0.00011330955144380283, + "loss": 0.80532026, + "num_input_tokens_seen": 339578048, + "router_z_loss_mlp": 0.14733887, + "step": 4096, + "time_per_iteration": 2.652745008468628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106538, + "balance_loss_mlp": 1.0502882, + "epoch": 0.7881877645248172, + "flos": 582278201856.0, + "grad_norm": 0.08295045554320525, + "language_loss": 0.85968649, + "learning_rate": 0.00011311212688062483, + "loss": 0.87034023, + "num_input_tokens_seen": 339650176, + "router_z_loss_mlp": 0.15063477, + "step": 4097, 
+ "time_per_iteration": 2.860481023788452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062768, + "balance_loss_mlp": 1.04778373, + "epoch": 0.7883801462100808, + "flos": 589171719168.0, + "grad_norm": 0.08289312695855233, + "language_loss": 0.77912939, + "learning_rate": 0.0001129148525206402, + "loss": 0.78975713, + "num_input_tokens_seen": 339727312, + "router_z_loss_mlp": 0.14953613, + "step": 4098, + "time_per_iteration": 2.8443920612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061859, + "balance_loss_mlp": 1.04729128, + "epoch": 0.7885725278953444, + "flos": 481728052224.0, + "grad_norm": 0.07565956052784888, + "language_loss": 0.86410785, + "learning_rate": 0.00011271772844043759, + "loss": 0.87472647, + "num_input_tokens_seen": 339801344, + "router_z_loss_mlp": 0.14562988, + "step": 4099, + "time_per_iteration": 2.67754864692688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061328, + "balance_loss_mlp": 1.04616427, + "epoch": 0.788764909580608, + "flos": 756794824704.0, + "grad_norm": 0.08256938816600788, + "language_loss": 0.76203871, + "learning_rate": 0.00011252075471654727, + "loss": 0.77265191, + "num_input_tokens_seen": 339877840, + "router_z_loss_mlp": 0.15136719, + "step": 4100, + "time_per_iteration": 2.9445242881774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062896, + "balance_loss_mlp": 1.04794669, + "epoch": 0.7889572912658714, + "flos": 702555213312.0, + "grad_norm": 0.06872757551446003, + "language_loss": 0.77701616, + "learning_rate": 0.00011232393142544133, + "loss": 0.7876451, + "num_input_tokens_seen": 339959568, + "router_z_loss_mlp": 0.14929199, + "step": 4101, + "time_per_iteration": 2.9418632984161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060972, + "balance_loss_mlp": 1.04578507, + "epoch": 0.789149672951135, + "flos": 736405364736.0, + "grad_norm": 0.0823367955958929, + "language_loss": 0.82776141, + 
"learning_rate": 0.00011212725864353323, + "loss": 0.83837116, + "num_input_tokens_seen": 340043600, + "router_z_loss_mlp": 0.15161133, + "step": 4102, + "time_per_iteration": 3.066218614578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009146, + "balance_loss_mlp": 1.002756, + "epoch": 0.7893420546363986, + "flos": 1481396511744.0, + "grad_norm": 0.00970990136946143, + "language_loss": 0.76335925, + "learning_rate": 0.00011193073644717822, + "loss": 0.77345079, + "num_input_tokens_seen": 340270608, + "router_z_loss_mlp": 0.06396484, + "step": 4103, + "time_per_iteration": 4.897639036178589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068091, + "balance_loss_mlp": 1.0529511, + "epoch": 0.7895344363216622, + "flos": 509072698368.0, + "grad_norm": 0.08329351881420698, + "language_loss": 0.75839722, + "learning_rate": 0.00011173436491267291, + "loss": 0.76907814, + "num_input_tokens_seen": 340338784, + "router_z_loss_mlp": 0.15148926, + "step": 4104, + "time_per_iteration": 2.572232484817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069348, + "balance_loss_mlp": 1.05428004, + "epoch": 0.7897268180069258, + "flos": 541988983296.0, + "grad_norm": 0.07889516146695053, + "language_loss": 0.81743544, + "learning_rate": 0.0001115381441162554, + "loss": 0.82812893, + "num_input_tokens_seen": 340407744, + "router_z_loss_mlp": 0.1505127, + "step": 4105, + "time_per_iteration": 2.6332814693450928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010093, + "balance_loss_mlp": 1.00375092, + "epoch": 0.7899191996921893, + "flos": 1412687817216.0, + "grad_norm": 0.008847709876927975, + "language_loss": 0.73583722, + "learning_rate": 0.00011134207413410557, + "loss": 0.74593818, + "num_input_tokens_seen": 340635824, + "router_z_loss_mlp": 0.06347656, + "step": 4106, + "time_per_iteration": 4.895688533782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066556, + 
"balance_loss_mlp": 1.05182195, + "epoch": 0.7901115813774529, + "flos": 622841633280.0, + "grad_norm": 0.06549029266923186, + "language_loss": 0.85235715, + "learning_rate": 0.00011114615504234465, + "loss": 0.86302269, + "num_input_tokens_seen": 340710928, + "router_z_loss_mlp": 0.1472168, + "step": 4107, + "time_per_iteration": 2.799600839614868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064328, + "balance_loss_mlp": 1.04949808, + "epoch": 0.7903039630627164, + "flos": 645545935872.0, + "grad_norm": 0.08208418526226827, + "language_loss": 0.80641502, + "learning_rate": 0.00011095038691703468, + "loss": 0.81705832, + "num_input_tokens_seen": 340786128, + "router_z_loss_mlp": 0.14819336, + "step": 4108, + "time_per_iteration": 2.877985715866089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062498, + "balance_loss_mlp": 1.0479306, + "epoch": 0.79049634474798, + "flos": 594365257728.0, + "grad_norm": 0.09971015959330254, + "language_loss": 0.82810932, + "learning_rate": 0.00011075476983417998, + "loss": 0.83873427, + "num_input_tokens_seen": 340861616, + "router_z_loss_mlp": 0.14550781, + "step": 4109, + "time_per_iteration": 2.881120204925537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106703, + "balance_loss_mlp": 1.05190194, + "epoch": 0.7906887264332435, + "flos": 716093001216.0, + "grad_norm": 0.08829657837561553, + "language_loss": 0.77800107, + "learning_rate": 0.00011055930386972579, + "loss": 0.78867137, + "num_input_tokens_seen": 340934480, + "router_z_loss_mlp": 0.15112305, + "step": 4110, + "time_per_iteration": 2.8346822261810303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068152, + "balance_loss_mlp": 1.05334675, + "epoch": 0.7908811081185071, + "flos": 789893918208.0, + "grad_norm": 0.07023814842259256, + "language_loss": 0.78629267, + "learning_rate": 0.00011036398909955863, + "loss": 0.79697418, + "num_input_tokens_seen": 341014912, + "router_z_loss_mlp": 
0.14794922, + "step": 4111, + "time_per_iteration": 2.9915273189544678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072098, + "balance_loss_mlp": 1.05748332, + "epoch": 0.7910734898037707, + "flos": 641904072192.0, + "grad_norm": 0.06852892596590886, + "language_loss": 0.81336486, + "learning_rate": 0.00011016882559950648, + "loss": 0.82408583, + "num_input_tokens_seen": 341090608, + "router_z_loss_mlp": 0.14599609, + "step": 4112, + "time_per_iteration": 2.83972430229187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106327, + "balance_loss_mlp": 1.04859507, + "epoch": 0.7912658714890343, + "flos": 669357374976.0, + "grad_norm": 0.0738063160504073, + "language_loss": 0.80238831, + "learning_rate": 0.00010997381344533853, + "loss": 0.813021, + "num_input_tokens_seen": 341160992, + "router_z_loss_mlp": 0.14648438, + "step": 4113, + "time_per_iteration": 2.7973837852478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073593, + "balance_loss_mlp": 1.0586915, + "epoch": 0.7914582531742979, + "flos": 557779944960.0, + "grad_norm": 0.07609152386132986, + "language_loss": 0.80731696, + "learning_rate": 0.00010977895271276517, + "loss": 0.81805289, + "num_input_tokens_seen": 341232032, + "router_z_loss_mlp": 0.14892578, + "step": 4114, + "time_per_iteration": 2.7021210193634033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106794, + "balance_loss_mlp": 1.05307484, + "epoch": 0.7916506348595613, + "flos": 570064863744.0, + "grad_norm": 0.06963604008344469, + "language_loss": 0.79982167, + "learning_rate": 0.00010958424347743807, + "loss": 0.8105011, + "num_input_tokens_seen": 341303888, + "router_z_loss_mlp": 0.14831543, + "step": 4115, + "time_per_iteration": 2.722219228744507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077261, + "balance_loss_mlp": 1.06282425, + "epoch": 0.7918430165448249, + "flos": 718301758464.0, + "grad_norm": 0.06932829196563554, + 
"language_loss": 0.80035752, + "learning_rate": 0.00010938968581494991, + "loss": 0.81113005, + "num_input_tokens_seen": 341385616, + "router_z_loss_mlp": 0.14440918, + "step": 4116, + "time_per_iteration": 3.020597457885742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107207, + "balance_loss_mlp": 1.05728841, + "epoch": 0.7920353982300885, + "flos": 553648753152.0, + "grad_norm": 0.08602194395595932, + "language_loss": 0.79036731, + "learning_rate": 0.000109195279800835, + "loss": 0.80108798, + "num_input_tokens_seen": 341460976, + "router_z_loss_mlp": 0.14758301, + "step": 4117, + "time_per_iteration": 2.752718210220337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068929, + "balance_loss_mlp": 1.05411148, + "epoch": 0.7922277799153521, + "flos": 810120019968.0, + "grad_norm": 0.08368902662154773, + "language_loss": 0.76681507, + "learning_rate": 0.00010900102551056834, + "loss": 0.77750438, + "num_input_tokens_seen": 341537328, + "router_z_loss_mlp": 0.14794922, + "step": 4118, + "time_per_iteration": 3.036240816116333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066229, + "balance_loss_mlp": 1.05164957, + "epoch": 0.7924201616006156, + "flos": 421351123968.0, + "grad_norm": 0.07604531563776018, + "language_loss": 0.84288156, + "learning_rate": 0.00010880692301956601, + "loss": 0.85354388, + "num_input_tokens_seen": 341600272, + "router_z_loss_mlp": 0.14550781, + "step": 4119, + "time_per_iteration": 2.493804693222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074792, + "balance_loss_mlp": 1.05971193, + "epoch": 0.7926125432858792, + "flos": 617852924928.0, + "grad_norm": 0.06651444124896129, + "language_loss": 0.86047828, + "learning_rate": 0.00010861297240318518, + "loss": 0.87122619, + "num_input_tokens_seen": 341682096, + "router_z_loss_mlp": 0.1505127, + "step": 4120, + "time_per_iteration": 2.934854030609131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01067653, + "balance_loss_mlp": 1.05324042, + "epoch": 0.7928049249711427, + "flos": 602487051264.0, + "grad_norm": 0.07241093769806302, + "language_loss": 0.86881423, + "learning_rate": 0.00010841917373672444, + "loss": 0.87949073, + "num_input_tokens_seen": 341754912, + "router_z_loss_mlp": 0.1439209, + "step": 4121, + "time_per_iteration": 2.7358059883117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071172, + "balance_loss_mlp": 1.0565567, + "epoch": 0.7929973066564063, + "flos": 656024790528.0, + "grad_norm": 0.08053859190471425, + "language_loss": 0.78445637, + "learning_rate": 0.00010822552709542293, + "loss": 0.79516816, + "num_input_tokens_seen": 341831152, + "router_z_loss_mlp": 0.14599609, + "step": 4122, + "time_per_iteration": 2.8181402683258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071522, + "balance_loss_mlp": 1.05708575, + "epoch": 0.7931896883416699, + "flos": 536397520896.0, + "grad_norm": 0.07023161642485896, + "language_loss": 0.85994995, + "learning_rate": 0.0001080320325544612, + "loss": 0.87066519, + "num_input_tokens_seen": 341903552, + "router_z_loss_mlp": 0.14428711, + "step": 4123, + "time_per_iteration": 2.666490316390991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070673, + "balance_loss_mlp": 1.05572367, + "epoch": 0.7933820700269334, + "flos": 498082493952.0, + "grad_norm": 0.1106860652376269, + "language_loss": 0.82816887, + "learning_rate": 0.00010783869018895997, + "loss": 0.83887559, + "num_input_tokens_seen": 341972256, + "router_z_loss_mlp": 0.14953613, + "step": 4124, + "time_per_iteration": 2.6471545696258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071579, + "balance_loss_mlp": 1.0571543, + "epoch": 0.793574451712197, + "flos": 537472350720.0, + "grad_norm": 0.07283258484620453, + "language_loss": 0.84189153, + "learning_rate": 0.00010764550007398189, + "loss": 0.85260737, + "num_input_tokens_seen": 342040496, + 
"router_z_loss_mlp": 0.14416504, + "step": 4125, + "time_per_iteration": 2.6587061882019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067686, + "balance_loss_mlp": 1.05301166, + "epoch": 0.7937668333974606, + "flos": 488285687808.0, + "grad_norm": 0.13078671480405682, + "language_loss": 0.81167138, + "learning_rate": 0.00010745246228452982, + "loss": 0.82234824, + "num_input_tokens_seen": 342108512, + "router_z_loss_mlp": 0.14660645, + "step": 4126, + "time_per_iteration": 2.645451784133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071819, + "balance_loss_mlp": 1.05741882, + "epoch": 0.7939592150827242, + "flos": 527425924608.0, + "grad_norm": 0.07416949151547285, + "language_loss": 0.81678915, + "learning_rate": 0.00010725957689554771, + "loss": 0.82750738, + "num_input_tokens_seen": 342183568, + "router_z_loss_mlp": 0.14379883, + "step": 4127, + "time_per_iteration": 2.765888214111328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068227, + "balance_loss_mlp": 1.05377841, + "epoch": 0.7941515967679876, + "flos": 541702287360.0, + "grad_norm": 0.059315040508318935, + "language_loss": 0.84973252, + "learning_rate": 0.00010706684398192013, + "loss": 0.86041474, + "num_input_tokens_seen": 342259920, + "router_z_loss_mlp": 0.14416504, + "step": 4128, + "time_per_iteration": 2.71177339553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073885, + "balance_loss_mlp": 1.0591861, + "epoch": 0.7943439784532512, + "flos": 518387516928.0, + "grad_norm": 0.07619386086866002, + "language_loss": 0.81954181, + "learning_rate": 0.00010687426361847313, + "loss": 0.83028066, + "num_input_tokens_seen": 342330192, + "router_z_loss_mlp": 0.14685059, + "step": 4129, + "time_per_iteration": 2.758657455444336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106652, + "balance_loss_mlp": 1.05164242, + "epoch": 0.7945363601385148, + "flos": 509025710592.0, + "grad_norm": 
0.07169416903857827, + "language_loss": 0.85882586, + "learning_rate": 0.00010668183587997254, + "loss": 0.86949104, + "num_input_tokens_seen": 342398944, + "router_z_loss_mlp": 0.14868164, + "step": 4130, + "time_per_iteration": 2.596605062484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067605, + "balance_loss_mlp": 1.05271626, + "epoch": 0.7947287418237784, + "flos": 651214121472.0, + "grad_norm": 0.08455978064709659, + "language_loss": 0.77324224, + "learning_rate": 0.0001064895608411256, + "loss": 0.78391826, + "num_input_tokens_seen": 342474000, + "router_z_loss_mlp": 0.14868164, + "step": 4131, + "time_per_iteration": 2.796661853790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066687, + "balance_loss_mlp": 1.05177402, + "epoch": 0.794921123509042, + "flos": 696054477312.0, + "grad_norm": 0.0694318073220064, + "language_loss": 0.80456048, + "learning_rate": 0.00010629743857657998, + "loss": 0.81522739, + "num_input_tokens_seen": 342549184, + "router_z_loss_mlp": 0.14880371, + "step": 4132, + "time_per_iteration": 2.909764289855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012901, + "balance_loss_mlp": 1.00636816, + "epoch": 0.7951135051943055, + "flos": 1402942768128.0, + "grad_norm": 0.005332975437914604, + "language_loss": 0.70598668, + "learning_rate": 0.0001061054691609244, + "loss": 0.71611571, + "num_input_tokens_seen": 342767376, + "router_z_loss_mlp": 0.06542969, + "step": 4133, + "time_per_iteration": 4.588430166244507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107322, + "balance_loss_mlp": 1.05867648, + "epoch": 0.795305886879569, + "flos": 810085515264.0, + "grad_norm": 0.06887248612226421, + "language_loss": 0.82134831, + "learning_rate": 0.00010591365266868802, + "loss": 0.83208048, + "num_input_tokens_seen": 342845024, + "router_z_loss_mlp": 0.14550781, + "step": 4134, + "time_per_iteration": 2.9644124507904053 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01012906, + "balance_loss_mlp": 1.00632596, + "epoch": 0.7954982685648326, + "flos": 1426005347328.0, + "grad_norm": 0.005331901517009852, + "language_loss": 0.75511783, + "learning_rate": 0.00010572198917434018, + "loss": 0.76524687, + "num_input_tokens_seen": 343072496, + "router_z_loss_mlp": 0.06591797, + "step": 4135, + "time_per_iteration": 4.896401405334473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068705, + "balance_loss_mlp": 1.05358934, + "epoch": 0.7956906502500962, + "flos": 389885197824.0, + "grad_norm": 0.06724470827619233, + "language_loss": 0.79032838, + "learning_rate": 0.00010553047875229166, + "loss": 0.80101544, + "num_input_tokens_seen": 343136928, + "router_z_loss_mlp": 0.15100098, + "step": 4136, + "time_per_iteration": 2.5450961589813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066364, + "balance_loss_mlp": 1.05189192, + "epoch": 0.7958830319353598, + "flos": 515573434368.0, + "grad_norm": 0.07076357232689101, + "language_loss": 0.83468044, + "learning_rate": 0.00010533912147689328, + "loss": 0.84534407, + "num_input_tokens_seen": 343207440, + "router_z_loss_mlp": 0.14465332, + "step": 4137, + "time_per_iteration": 2.693084239959717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070158, + "balance_loss_mlp": 1.05553102, + "epoch": 0.7960754136206233, + "flos": 493941390336.0, + "grad_norm": 0.06121658887981785, + "language_loss": 0.8226397, + "learning_rate": 0.00010514791742243656, + "loss": 0.8333413, + "num_input_tokens_seen": 343273744, + "router_z_loss_mlp": 0.14599609, + "step": 4138, + "time_per_iteration": 2.6134862899780273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066373, + "balance_loss_mlp": 1.05163932, + "epoch": 0.7962677953058869, + "flos": 655728182784.0, + "grad_norm": 0.07416353979296561, + "language_loss": 0.82627141, + "learning_rate": 0.00010495686666315341, + "loss": 0.83693522, + 
"num_input_tokens_seen": 343357648, + "router_z_loss_mlp": 0.14733887, + "step": 4139, + "time_per_iteration": 2.8959176540374756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071604, + "balance_loss_mlp": 1.05745411, + "epoch": 0.7964601769911505, + "flos": 542384335872.0, + "grad_norm": 0.08076949744760686, + "language_loss": 0.77108532, + "learning_rate": 0.00010476596927321635, + "loss": 0.78180134, + "num_input_tokens_seen": 343425344, + "router_z_loss_mlp": 0.14147949, + "step": 4140, + "time_per_iteration": 2.6166224479675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067987, + "balance_loss_mlp": 1.05282402, + "epoch": 0.796652558676414, + "flos": 537650016768.0, + "grad_norm": 0.07641249861388391, + "language_loss": 0.8031469, + "learning_rate": 0.00010457522532673835, + "loss": 0.8138268, + "num_input_tokens_seen": 343504960, + "router_z_loss_mlp": 0.15136719, + "step": 4141, + "time_per_iteration": 2.8061392307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073775, + "balance_loss_mlp": 1.05888581, + "epoch": 0.7968449403616775, + "flos": 475091495424.0, + "grad_norm": 0.082895122944158, + "language_loss": 0.8321951, + "learning_rate": 0.00010438463489777272, + "loss": 0.84293288, + "num_input_tokens_seen": 343570832, + "router_z_loss_mlp": 0.14892578, + "step": 4142, + "time_per_iteration": 2.563521385192871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066066, + "balance_loss_mlp": 1.05122459, + "epoch": 0.7970373220469411, + "flos": 567613827072.0, + "grad_norm": 0.07215110676242628, + "language_loss": 0.77859384, + "learning_rate": 0.00010419419806031316, + "loss": 0.78925455, + "num_input_tokens_seen": 343639808, + "router_z_loss_mlp": 0.14807129, + "step": 4143, + "time_per_iteration": 2.692662000656128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075522, + "balance_loss_mlp": 1.06125236, + "epoch": 0.7972297037322047, + "flos": 
556208446464.0, + "grad_norm": 0.1076253909846465, + "language_loss": 0.83906108, + "learning_rate": 0.00010400391488829403, + "loss": 0.84981632, + "num_input_tokens_seen": 343715232, + "router_z_loss_mlp": 0.1427002, + "step": 4144, + "time_per_iteration": 2.853351593017578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107537, + "balance_loss_mlp": 1.06045663, + "epoch": 0.7974220854174683, + "flos": 576180158976.0, + "grad_norm": 0.14917315056572417, + "language_loss": 0.86392915, + "learning_rate": 0.00010381378545558984, + "loss": 0.87468284, + "num_input_tokens_seen": 343787168, + "router_z_loss_mlp": 0.14892578, + "step": 4145, + "time_per_iteration": 2.7161378860473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070419, + "balance_loss_mlp": 1.05575621, + "epoch": 0.7976144671027319, + "flos": 483069754368.0, + "grad_norm": 0.08038510657602778, + "language_loss": 0.8457576, + "learning_rate": 0.00010362380983601505, + "loss": 0.85646176, + "num_input_tokens_seen": 343853600, + "router_z_loss_mlp": 0.14648438, + "step": 4146, + "time_per_iteration": 2.5544986724853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068569, + "balance_loss_mlp": 1.05391836, + "epoch": 0.7978068487879953, + "flos": 1077865615872.0, + "grad_norm": 0.05616342644239884, + "language_loss": 0.78731227, + "learning_rate": 0.00010343398810332477, + "loss": 0.79799801, + "num_input_tokens_seen": 343942816, + "router_z_loss_mlp": 0.1463623, + "step": 4147, + "time_per_iteration": 3.4725289344787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070017, + "balance_loss_mlp": 1.05533075, + "epoch": 0.7979992304732589, + "flos": 733739586048.0, + "grad_norm": 0.07604084389723553, + "language_loss": 0.84285581, + "learning_rate": 0.00010324432033121467, + "loss": 0.85355598, + "num_input_tokens_seen": 344021232, + "router_z_loss_mlp": 0.14672852, + "step": 4148, + "time_per_iteration": 2.925584554672241 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072951, + "balance_loss_mlp": 1.05818057, + "epoch": 0.7981916121585225, + "flos": 415774342656.0, + "grad_norm": 0.07506198760098327, + "language_loss": 0.83406597, + "learning_rate": 0.00010305480659332005, + "loss": 0.84479547, + "num_input_tokens_seen": 344089616, + "router_z_loss_mlp": 0.14746094, + "step": 4149, + "time_per_iteration": 2.6363680362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073859, + "balance_loss_mlp": 1.05917299, + "epoch": 0.7983839938437861, + "flos": 465257613312.0, + "grad_norm": 0.07209752388462913, + "language_loss": 0.83577174, + "learning_rate": 0.00010286544696321682, + "loss": 0.84651035, + "num_input_tokens_seen": 344154992, + "router_z_loss_mlp": 0.14685059, + "step": 4150, + "time_per_iteration": 2.5717275142669678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071445, + "balance_loss_mlp": 1.05673504, + "epoch": 0.7985763755290496, + "flos": 510567473664.0, + "grad_norm": 0.08223257276108414, + "language_loss": 0.79523313, + "learning_rate": 0.00010267624151442073, + "loss": 0.80594754, + "num_input_tokens_seen": 344225232, + "router_z_loss_mlp": 0.14685059, + "step": 4151, + "time_per_iteration": 2.6743481159210205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069723, + "balance_loss_mlp": 1.05467856, + "epoch": 0.7987687572143132, + "flos": 1010649498624.0, + "grad_norm": 0.0703143889745847, + "language_loss": 0.80934834, + "learning_rate": 0.000102487190320388, + "loss": 0.82004559, + "num_input_tokens_seen": 344309120, + "router_z_loss_mlp": 0.15014648, + "step": 4152, + "time_per_iteration": 3.3981220722198486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067559, + "balance_loss_mlp": 1.05272949, + "epoch": 0.7989611388995768, + "flos": 1021078794240.0, + "grad_norm": 0.32574544217784795, + "language_loss": 0.79635817, + "learning_rate": 0.00010229829345451475, + "loss": 
0.80703378, + "num_input_tokens_seen": 344394112, + "router_z_loss_mlp": 0.14819336, + "step": 4153, + "time_per_iteration": 3.3228917121887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074745, + "balance_loss_mlp": 1.06002223, + "epoch": 0.7991535205848403, + "flos": 1101338601984.0, + "grad_norm": 0.06282548479751149, + "language_loss": 0.79764807, + "learning_rate": 0.00010210955099013724, + "loss": 0.8083955, + "num_input_tokens_seen": 344476512, + "router_z_loss_mlp": 0.14709473, + "step": 4154, + "time_per_iteration": 3.412867784500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070719, + "balance_loss_mlp": 1.05562687, + "epoch": 0.7993459022701039, + "flos": 834818337792.0, + "grad_norm": 0.0818211478301838, + "language_loss": 0.76729739, + "learning_rate": 0.00010192096300053167, + "loss": 0.77800465, + "num_input_tokens_seen": 344561088, + "router_z_loss_mlp": 0.15063477, + "step": 4155, + "time_per_iteration": 3.071514368057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071934, + "balance_loss_mlp": 1.05697358, + "epoch": 0.7995382839553674, + "flos": 522686836224.0, + "grad_norm": 0.06336335817254321, + "language_loss": 0.85153681, + "learning_rate": 0.00010173252955891477, + "loss": 0.86225611, + "num_input_tokens_seen": 344639424, + "router_z_loss_mlp": 0.14941406, + "step": 4156, + "time_per_iteration": 2.8336803913116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078348, + "balance_loss_mlp": 1.06368518, + "epoch": 0.799730665640631, + "flos": 537820715520.0, + "grad_norm": 0.07241348777253756, + "language_loss": 0.73074377, + "learning_rate": 0.00010154425073844253, + "loss": 0.74152726, + "num_input_tokens_seen": 344710048, + "router_z_loss_mlp": 0.1463623, + "step": 4157, + "time_per_iteration": 2.708444356918335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071825, + "balance_loss_mlp": 1.05725741, + "epoch": 0.7999230473258946, + 
"flos": 505060075008.0, + "grad_norm": 0.05965313173183175, + "language_loss": 0.82319427, + "learning_rate": 0.00010135612661221138, + "loss": 0.83391249, + "num_input_tokens_seen": 344776832, + "router_z_loss_mlp": 0.14562988, + "step": 4158, + "time_per_iteration": 2.5790717601776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074651, + "balance_loss_mlp": 1.06004786, + "epoch": 0.8001154290111582, + "flos": 1027342393344.0, + "grad_norm": 0.07976593337081749, + "language_loss": 0.81996578, + "learning_rate": 0.00010116815725325751, + "loss": 0.83071226, + "num_input_tokens_seen": 344864928, + "router_z_loss_mlp": 0.14587402, + "step": 4159, + "time_per_iteration": 3.352048635482788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074283, + "balance_loss_mlp": 1.0596205, + "epoch": 0.8003078106964217, + "flos": 750906754560.0, + "grad_norm": 0.07121421414311549, + "language_loss": 0.80415642, + "learning_rate": 0.00010098034273455725, + "loss": 0.81489933, + "num_input_tokens_seen": 344944048, + "router_z_loss_mlp": 0.1463623, + "step": 4160, + "time_per_iteration": 2.9569175243377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071179, + "balance_loss_mlp": 1.05632544, + "epoch": 0.8005001923816852, + "flos": 488465925120.0, + "grad_norm": 0.06668806534008753, + "language_loss": 0.79674023, + "learning_rate": 0.00010079268312902662, + "loss": 0.80745208, + "num_input_tokens_seen": 345015392, + "router_z_loss_mlp": 0.14831543, + "step": 4161, + "time_per_iteration": 2.6834394931793213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075834, + "balance_loss_mlp": 1.06082582, + "epoch": 0.8006925740669488, + "flos": 513248306688.0, + "grad_norm": 0.08968184454312078, + "language_loss": 0.81960094, + "learning_rate": 0.0001006051785095215, + "loss": 0.83035922, + "num_input_tokens_seen": 345086640, + "router_z_loss_mlp": 0.14978027, + "step": 4162, + "time_per_iteration": 
2.737863779067993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073874, + "balance_loss_mlp": 1.05921173, + "epoch": 0.8008849557522124, + "flos": 578529879552.0, + "grad_norm": 0.09340596389529475, + "language_loss": 0.79312497, + "learning_rate": 0.0001004178289488376, + "loss": 0.8038637, + "num_input_tokens_seen": 345159616, + "router_z_loss_mlp": 0.14672852, + "step": 4163, + "time_per_iteration": 2.7409329414367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074352, + "balance_loss_mlp": 1.05973649, + "epoch": 0.801077337437476, + "flos": 478708766208.0, + "grad_norm": 0.07216515601811406, + "language_loss": 0.84130692, + "learning_rate": 0.0001002306345197106, + "loss": 0.85205042, + "num_input_tokens_seen": 345225536, + "router_z_loss_mlp": 0.14599609, + "step": 4164, + "time_per_iteration": 2.537263870239258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078084, + "balance_loss_mlp": 1.06349313, + "epoch": 0.8012697191227395, + "flos": 676700573184.0, + "grad_norm": 0.07365299620590934, + "language_loss": 0.80348939, + "learning_rate": 0.00010004359529481571, + "loss": 0.81427026, + "num_input_tokens_seen": 345302960, + "router_z_loss_mlp": 0.14575195, + "step": 4165, + "time_per_iteration": 2.8671815395355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078878, + "balance_loss_mlp": 1.0639292, + "epoch": 0.8014621008080031, + "flos": 1295132405760.0, + "grad_norm": 0.08098076628944305, + "language_loss": 0.82058138, + "learning_rate": 9.985671134676804e-05, + "loss": 0.83137012, + "num_input_tokens_seen": 345397792, + "router_z_loss_mlp": 0.14941406, + "step": 4166, + "time_per_iteration": 3.699275255203247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076335, + "balance_loss_mlp": 1.06169629, + "epoch": 0.8016544824932667, + "flos": 511827683328.0, + "grad_norm": 0.0841721873236777, + "language_loss": 0.82996416, + "learning_rate": 
9.966998274812234e-05, + "loss": 0.84072757, + "num_input_tokens_seen": 345465440, + "router_z_loss_mlp": 0.14611816, + "step": 4167, + "time_per_iteration": 2.6094071865081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078702, + "balance_loss_mlp": 1.06408715, + "epoch": 0.8018468641785302, + "flos": 535690879488.0, + "grad_norm": 0.09563124315006066, + "language_loss": 0.80843663, + "learning_rate": 9.948340957137308e-05, + "loss": 0.8192237, + "num_input_tokens_seen": 345533072, + "router_z_loss_mlp": 0.14611816, + "step": 4168, + "time_per_iteration": 2.6237661838531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079459, + "balance_loss_mlp": 1.0646534, + "epoch": 0.8020392458637937, + "flos": 1023431086080.0, + "grad_norm": 0.0771033219349132, + "language_loss": 0.79519576, + "learning_rate": 9.929699188895447e-05, + "loss": 0.8059904, + "num_input_tokens_seen": 345622208, + "router_z_loss_mlp": 0.14794922, + "step": 4169, + "time_per_iteration": 3.28833270072937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027031, + "balance_loss_mlp": 1.02049804, + "epoch": 0.8022316275490573, + "flos": 1561806821376.0, + "grad_norm": 0.022525572886173285, + "language_loss": 0.78054404, + "learning_rate": 9.911072977324009e-05, + "loss": 0.79081434, + "num_input_tokens_seen": 345852544, + "router_z_loss_mlp": 0.06542969, + "step": 4170, + "time_per_iteration": 4.9581146240234375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079183, + "balance_loss_mlp": 1.06417489, + "epoch": 0.8024240092343209, + "flos": 420698810880.0, + "grad_norm": 0.08083789363568177, + "language_loss": 0.83295381, + "learning_rate": 9.89246232965435e-05, + "loss": 0.84374571, + "num_input_tokens_seen": 345917328, + "router_z_loss_mlp": 0.14990234, + "step": 4171, + "time_per_iteration": 2.5198962688446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107979, + "balance_loss_mlp": 1.0656513, + 
"epoch": 0.8026163909195845, + "flos": 763836645888.0, + "grad_norm": 0.0840583068148426, + "language_loss": 0.7862519, + "learning_rate": 9.873867253111762e-05, + "loss": 0.79704976, + "num_input_tokens_seen": 345995936, + "router_z_loss_mlp": 0.14147949, + "step": 4172, + "time_per_iteration": 2.9434571266174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020525, + "balance_loss_mlp": 1.01408792, + "epoch": 0.8028087726048481, + "flos": 1518861362688.0, + "grad_norm": 0.0182097422778206, + "language_loss": 0.80264562, + "learning_rate": 9.855287754915503e-05, + "loss": 0.81285089, + "num_input_tokens_seen": 346232720, + "router_z_loss_mlp": 0.06445312, + "step": 4173, + "time_per_iteration": 4.941962718963623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076678, + "balance_loss_mlp": 1.06175327, + "epoch": 0.8030011542901115, + "flos": 517861486080.0, + "grad_norm": 0.16472070475326986, + "language_loss": 0.88381541, + "learning_rate": 9.836723842278733e-05, + "loss": 0.89458215, + "num_input_tokens_seen": 346298208, + "router_z_loss_mlp": 0.14892578, + "step": 4174, + "time_per_iteration": 2.6340060234069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079926, + "balance_loss_mlp": 1.0656451, + "epoch": 0.8031935359753751, + "flos": 545616165888.0, + "grad_norm": 0.07241242292177963, + "language_loss": 0.78390783, + "learning_rate": 9.818175522408646e-05, + "loss": 0.79470706, + "num_input_tokens_seen": 346370080, + "router_z_loss_mlp": 0.1427002, + "step": 4175, + "time_per_iteration": 2.7003095149993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075656, + "balance_loss_mlp": 1.06123185, + "epoch": 0.8033859176606387, + "flos": 603559309824.0, + "grad_norm": 0.08397414086825541, + "language_loss": 0.84535128, + "learning_rate": 9.79964280250632e-05, + "loss": 0.85610783, + "num_input_tokens_seen": 346442432, + "router_z_loss_mlp": 0.14416504, + "step": 4176, + 
"time_per_iteration": 2.842618227005005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071865, + "balance_loss_mlp": 1.05723834, + "epoch": 0.8035782993459023, + "flos": 565859520000.0, + "grad_norm": 0.08177365403070841, + "language_loss": 0.81297785, + "learning_rate": 9.781125689766795e-05, + "loss": 0.82369649, + "num_input_tokens_seen": 346513088, + "router_z_loss_mlp": 0.14624023, + "step": 4177, + "time_per_iteration": 2.7389862537384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077895, + "balance_loss_mlp": 1.06313717, + "epoch": 0.8037706810311658, + "flos": 538435952640.0, + "grad_norm": 0.07713213601435066, + "language_loss": 0.84999192, + "learning_rate": 9.762624191379054e-05, + "loss": 0.86077082, + "num_input_tokens_seen": 346581376, + "router_z_loss_mlp": 0.14733887, + "step": 4178, + "time_per_iteration": 2.6558520793914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070602, + "balance_loss_mlp": 1.05615425, + "epoch": 0.8039630627164294, + "flos": 515187993600.0, + "grad_norm": 0.07194102205057808, + "language_loss": 0.79348469, + "learning_rate": 9.744138314526014e-05, + "loss": 0.80419075, + "num_input_tokens_seen": 346653328, + "router_z_loss_mlp": 0.14428711, + "step": 4179, + "time_per_iteration": 2.6974868774414062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01013319, + "balance_loss_mlp": 1.00692964, + "epoch": 0.804155444401693, + "flos": 1478834247168.0, + "grad_norm": 0.009099192400520165, + "language_loss": 0.74733561, + "learning_rate": 9.725668066384535e-05, + "loss": 0.75746888, + "num_input_tokens_seen": 346873264, + "router_z_loss_mlp": 0.06396484, + "step": 4180, + "time_per_iteration": 4.910180330276489 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078163, + "balance_loss_mlp": 1.06361914, + "epoch": 0.8043478260869565, + "flos": 521164896768.0, + "grad_norm": 0.06460867004727015, + "language_loss": 0.76895148, + 
"learning_rate": 9.707213454125396e-05, + "loss": 0.77973306, + "num_input_tokens_seen": 346946272, + "router_z_loss_mlp": 0.14538574, + "step": 4181, + "time_per_iteration": 2.673661470413208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070028, + "balance_loss_mlp": 1.05518675, + "epoch": 0.8045402077722201, + "flos": 545448038400.0, + "grad_norm": 0.06289883522471808, + "language_loss": 0.80526221, + "learning_rate": 9.688774484913298e-05, + "loss": 0.81596249, + "num_input_tokens_seen": 347024048, + "router_z_loss_mlp": 0.14819336, + "step": 4182, + "time_per_iteration": 2.8102376461029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107591, + "balance_loss_mlp": 1.06159282, + "epoch": 0.8047325894574836, + "flos": 678388068864.0, + "grad_norm": 0.06393173875827637, + "language_loss": 0.74231839, + "learning_rate": 9.670351165906921e-05, + "loss": 0.75307751, + "num_input_tokens_seen": 347108736, + "router_z_loss_mlp": 0.14306641, + "step": 4183, + "time_per_iteration": 2.9303932189941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107065, + "balance_loss_mlp": 1.05599904, + "epoch": 0.8049249711427472, + "flos": 587227262976.0, + "grad_norm": 0.09136907696197756, + "language_loss": 0.78323948, + "learning_rate": 9.65194350425882e-05, + "loss": 0.79394597, + "num_input_tokens_seen": 347184192, + "router_z_loss_mlp": 0.1463623, + "step": 4184, + "time_per_iteration": 2.787539005279541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069011, + "balance_loss_mlp": 1.05470586, + "epoch": 0.8051173528280108, + "flos": 814194312192.0, + "grad_norm": 0.08523258631943265, + "language_loss": 0.77739137, + "learning_rate": 9.633551507115452e-05, + "loss": 0.78808153, + "num_input_tokens_seen": 347282336, + "router_z_loss_mlp": 0.14306641, + "step": 4185, + "time_per_iteration": 3.130908727645874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075899, + "balance_loss_mlp": 
1.06085443, + "epoch": 0.8053097345132744, + "flos": 725687175168.0, + "grad_norm": 0.12627970759044813, + "language_loss": 0.77332032, + "learning_rate": 9.615175181617259e-05, + "loss": 0.78407931, + "num_input_tokens_seen": 347364800, + "router_z_loss_mlp": 0.15026855, + "step": 4186, + "time_per_iteration": 2.9669125080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078873, + "balance_loss_mlp": 1.06421077, + "epoch": 0.805502116198538, + "flos": 748050453504.0, + "grad_norm": 0.0799806090831211, + "language_loss": 0.81470084, + "learning_rate": 9.596814534898552e-05, + "loss": 0.82548958, + "num_input_tokens_seen": 347443328, + "router_z_loss_mlp": 0.14648438, + "step": 4187, + "time_per_iteration": 3.013604164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072205, + "balance_loss_mlp": 1.05763793, + "epoch": 0.8056944978838014, + "flos": 640258421760.0, + "grad_norm": 0.06438173450385795, + "language_loss": 0.87444198, + "learning_rate": 9.578469574087561e-05, + "loss": 0.88516408, + "num_input_tokens_seen": 347522064, + "router_z_loss_mlp": 0.14562988, + "step": 4188, + "time_per_iteration": 2.8994572162628174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072497, + "balance_loss_mlp": 1.05785775, + "epoch": 0.805886879569065, + "flos": 644631892992.0, + "grad_norm": 0.08726456548799634, + "language_loss": 0.78306341, + "learning_rate": 9.560140306306436e-05, + "loss": 0.79378831, + "num_input_tokens_seen": 347597200, + "router_z_loss_mlp": 0.14624023, + "step": 4189, + "time_per_iteration": 2.7558131217956543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076011, + "balance_loss_mlp": 1.06158686, + "epoch": 0.8060792612543286, + "flos": 661230812160.0, + "grad_norm": 0.07215370646866548, + "language_loss": 0.81434023, + "learning_rate": 9.541826738671233e-05, + "loss": 0.8251003, + "num_input_tokens_seen": 347676928, + "router_z_loss_mlp": 0.14404297, + "step": 4190, 
+ "time_per_iteration": 2.8377161026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073247, + "balance_loss_mlp": 1.05854881, + "epoch": 0.8062716429395922, + "flos": 455075366400.0, + "grad_norm": 0.08365860957548234, + "language_loss": 0.8272016, + "learning_rate": 9.523528878291904e-05, + "loss": 0.83793408, + "num_input_tokens_seen": 347741552, + "router_z_loss_mlp": 0.14697266, + "step": 4191, + "time_per_iteration": 2.5463461875915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073967, + "balance_loss_mlp": 1.05886352, + "epoch": 0.8064640246248557, + "flos": 526407994368.0, + "grad_norm": 0.08656547672961308, + "language_loss": 0.85237193, + "learning_rate": 9.50524673227231e-05, + "loss": 0.86311156, + "num_input_tokens_seen": 347807008, + "router_z_loss_mlp": 0.15087891, + "step": 4192, + "time_per_iteration": 2.628451108932495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074197, + "balance_loss_mlp": 1.05997539, + "epoch": 0.8066564063101193, + "flos": 865115458560.0, + "grad_norm": 0.060557734767924705, + "language_loss": 0.81796318, + "learning_rate": 9.486980307710208e-05, + "loss": 0.82870519, + "num_input_tokens_seen": 347895728, + "router_z_loss_mlp": 0.14233398, + "step": 4193, + "time_per_iteration": 3.221529960632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073516, + "balance_loss_mlp": 1.05888867, + "epoch": 0.8068487879953828, + "flos": 530536614912.0, + "grad_norm": 0.06679701242235103, + "language_loss": 0.81742352, + "learning_rate": 9.468729611697246e-05, + "loss": 0.82815868, + "num_input_tokens_seen": 347970368, + "router_z_loss_mlp": 0.14599609, + "step": 4194, + "time_per_iteration": 2.7180535793304443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107158, + "balance_loss_mlp": 1.05689359, + "epoch": 0.8070411696806464, + "flos": 566183291904.0, + "grad_norm": 0.06755378291949884, + "language_loss": 0.81656551, + 
"learning_rate": 9.450494651319003e-05, + "loss": 0.8272813, + "num_input_tokens_seen": 348039040, + "router_z_loss_mlp": 0.14672852, + "step": 4195, + "time_per_iteration": 2.661775827407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072945, + "balance_loss_mlp": 1.05812728, + "epoch": 0.80723355136591, + "flos": 986591010816.0, + "grad_norm": 0.05699418156609254, + "language_loss": 0.79166675, + "learning_rate": 9.432275433654885e-05, + "loss": 0.80239624, + "num_input_tokens_seen": 348126064, + "router_z_loss_mlp": 0.14794922, + "step": 4196, + "time_per_iteration": 3.3003180027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107439, + "balance_loss_mlp": 1.05973983, + "epoch": 0.8074259330511735, + "flos": 566961513984.0, + "grad_norm": 0.0689415903823296, + "language_loss": 0.82874274, + "learning_rate": 9.414071965778221e-05, + "loss": 0.83948666, + "num_input_tokens_seen": 348205888, + "router_z_loss_mlp": 0.1463623, + "step": 4197, + "time_per_iteration": 2.79154896736145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077001, + "balance_loss_mlp": 1.0621475, + "epoch": 0.8076183147364371, + "flos": 494662712832.0, + "grad_norm": 0.06697307053985302, + "language_loss": 0.79652965, + "learning_rate": 9.395884254756242e-05, + "loss": 0.80729973, + "num_input_tokens_seen": 348278608, + "router_z_loss_mlp": 0.14831543, + "step": 4198, + "time_per_iteration": 2.7206079959869385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074545, + "balance_loss_mlp": 1.06003702, + "epoch": 0.8078106964217007, + "flos": 420011993088.0, + "grad_norm": 0.0821513988093656, + "language_loss": 0.79780805, + "learning_rate": 9.377712307650044e-05, + "loss": 0.80855346, + "num_input_tokens_seen": 348341312, + "router_z_loss_mlp": 0.14489746, + "step": 4199, + "time_per_iteration": 2.510125160217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074169, + "balance_loss_mlp": 
1.05935168, + "epoch": 0.8080030781069643, + "flos": 527537152512.0, + "grad_norm": 0.07168048357507804, + "language_loss": 0.83144093, + "learning_rate": 9.359556131514602e-05, + "loss": 0.84218264, + "num_input_tokens_seen": 348409184, + "router_z_loss_mlp": 0.14794922, + "step": 4200, + "time_per_iteration": 2.659519910812378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107465, + "balance_loss_mlp": 1.05992758, + "epoch": 0.8081954597922277, + "flos": 544148554752.0, + "grad_norm": 0.061739081334624905, + "language_loss": 0.81328112, + "learning_rate": 9.341415733398733e-05, + "loss": 0.82402754, + "num_input_tokens_seen": 348480832, + "router_z_loss_mlp": 0.14697266, + "step": 4201, + "time_per_iteration": 2.641256809234619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073727, + "balance_loss_mlp": 1.0594337, + "epoch": 0.8083878414774913, + "flos": 640900823040.0, + "grad_norm": 0.07419172018903049, + "language_loss": 0.75228035, + "learning_rate": 9.323291120345207e-05, + "loss": 0.76301754, + "num_input_tokens_seen": 348559232, + "router_z_loss_mlp": 0.14294434, + "step": 4202, + "time_per_iteration": 2.8421621322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077714, + "balance_loss_mlp": 1.06293166, + "epoch": 0.8085802231627549, + "flos": 705614146560.0, + "grad_norm": 0.0834576005577422, + "language_loss": 0.72859406, + "learning_rate": 9.305182299390614e-05, + "loss": 0.73937118, + "num_input_tokens_seen": 348638960, + "router_z_loss_mlp": 0.14746094, + "step": 4203, + "time_per_iteration": 2.9009647369384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068048, + "balance_loss_mlp": 1.05312276, + "epoch": 0.8087726048480185, + "flos": 419762373120.0, + "grad_norm": 0.07762289218582992, + "language_loss": 0.88771188, + "learning_rate": 9.287089277565409e-05, + "loss": 0.89839238, + "num_input_tokens_seen": 348704816, + "router_z_loss_mlp": 0.14904785, + "step": 
4204, + "time_per_iteration": 2.6274211406707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075402, + "balance_loss_mlp": 1.06070352, + "epoch": 0.8089649865332821, + "flos": 508766178816.0, + "grad_norm": 0.07618621801756342, + "language_loss": 0.87048995, + "learning_rate": 9.269012061893922e-05, + "loss": 0.881244, + "num_input_tokens_seen": 348783504, + "router_z_loss_mlp": 0.14697266, + "step": 4205, + "time_per_iteration": 2.7980542182922363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107201, + "balance_loss_mlp": 1.05760992, + "epoch": 0.8091573682185456, + "flos": 457219883520.0, + "grad_norm": 0.06817145148860111, + "language_loss": 0.85155141, + "learning_rate": 9.250950659394386e-05, + "loss": 0.86227149, + "num_input_tokens_seen": 348858272, + "router_z_loss_mlp": 0.14404297, + "step": 4206, + "time_per_iteration": 2.7548696994781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068706, + "balance_loss_mlp": 1.05355477, + "epoch": 0.8093497499038091, + "flos": 525256441344.0, + "grad_norm": 0.07651954688486194, + "language_loss": 0.7713989, + "learning_rate": 9.232905077078824e-05, + "loss": 0.78208601, + "num_input_tokens_seen": 348934432, + "router_z_loss_mlp": 0.15124512, + "step": 4207, + "time_per_iteration": 2.7961602210998535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078891, + "balance_loss_mlp": 1.06387043, + "epoch": 0.8095421315890727, + "flos": 489617478144.0, + "grad_norm": 0.07872605928187458, + "language_loss": 0.76999003, + "learning_rate": 9.214875321953164e-05, + "loss": 0.78077894, + "num_input_tokens_seen": 349003856, + "router_z_loss_mlp": 0.15002441, + "step": 4208, + "time_per_iteration": 2.5866055488586426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067244, + "balance_loss_mlp": 1.05254602, + "epoch": 0.8097345132743363, + "flos": 625109861376.0, + "grad_norm": 0.06523356997123914, + "language_loss": 0.8081665, + 
"learning_rate": 9.196861401017164e-05, + "loss": 0.81883889, + "num_input_tokens_seen": 349080544, + "router_z_loss_mlp": 0.14685059, + "step": 4209, + "time_per_iteration": 2.789491653442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075999, + "balance_loss_mlp": 1.06115699, + "epoch": 0.8099268949595998, + "flos": 615688584192.0, + "grad_norm": 0.06679247683416532, + "language_loss": 0.79083157, + "learning_rate": 9.178863321264475e-05, + "loss": 0.80159163, + "num_input_tokens_seen": 349159072, + "router_z_loss_mlp": 0.14819336, + "step": 4210, + "time_per_iteration": 2.80202579498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072328, + "balance_loss_mlp": 1.05728388, + "epoch": 0.8101192766448634, + "flos": 479642632704.0, + "grad_norm": 0.08620954962074664, + "language_loss": 0.79814863, + "learning_rate": 9.160881089682566e-05, + "loss": 0.80887187, + "num_input_tokens_seen": 349230176, + "router_z_loss_mlp": 0.15026855, + "step": 4211, + "time_per_iteration": 2.657390594482422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072652, + "balance_loss_mlp": 1.05794144, + "epoch": 0.810311658330127, + "flos": 517327741440.0, + "grad_norm": 0.06333891813293195, + "language_loss": 0.86381185, + "learning_rate": 9.142914713252725e-05, + "loss": 0.87453836, + "num_input_tokens_seen": 349299760, + "router_z_loss_mlp": 0.14697266, + "step": 4212, + "time_per_iteration": 2.6212716102600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071222, + "balance_loss_mlp": 1.0565474, + "epoch": 0.8105040400153906, + "flos": 575782235136.0, + "grad_norm": 0.05862858167541506, + "language_loss": 0.84100783, + "learning_rate": 9.124964198950159e-05, + "loss": 0.85172009, + "num_input_tokens_seen": 349379712, + "router_z_loss_mlp": 0.1463623, + "step": 4213, + "time_per_iteration": 2.822678804397583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069022, + "balance_loss_mlp": 
1.05432391, + "epoch": 0.8106964217006541, + "flos": 638963707392.0, + "grad_norm": 0.07655774671852761, + "language_loss": 0.85175037, + "learning_rate": 9.107029553743862e-05, + "loss": 0.86244059, + "num_input_tokens_seen": 349460320, + "router_z_loss_mlp": 0.14685059, + "step": 4214, + "time_per_iteration": 2.8445212841033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071466, + "balance_loss_mlp": 1.05682731, + "epoch": 0.8108888033859176, + "flos": 579505964544.0, + "grad_norm": 0.1237093586633983, + "language_loss": 0.81737274, + "learning_rate": 9.089110784596672e-05, + "loss": 0.82808745, + "num_input_tokens_seen": 349527648, + "router_z_loss_mlp": 0.14611816, + "step": 4215, + "time_per_iteration": 2.733858585357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068919, + "balance_loss_mlp": 1.05443513, + "epoch": 0.8110811850711812, + "flos": 559907209728.0, + "grad_norm": 0.07043550712901828, + "language_loss": 0.83526266, + "learning_rate": 9.071207898465284e-05, + "loss": 0.84595191, + "num_input_tokens_seen": 349606912, + "router_z_loss_mlp": 0.14477539, + "step": 4216, + "time_per_iteration": 2.795978546142578 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010059, + "balance_loss_mlp": 1.00371671, + "epoch": 0.8112735667564448, + "flos": 1517939979264.0, + "grad_norm": 0.007733492761232115, + "language_loss": 0.77260417, + "learning_rate": 9.053320902300205e-05, + "loss": 0.78270477, + "num_input_tokens_seen": 349827040, + "router_z_loss_mlp": 0.06347656, + "step": 4217, + "time_per_iteration": 4.671598672866821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073174, + "balance_loss_mlp": 1.0584631, + "epoch": 0.8114659484417084, + "flos": 616340897280.0, + "grad_norm": 0.09308711291624655, + "language_loss": 0.850631, + "learning_rate": 9.035449803045792e-05, + "loss": 0.86136276, + "num_input_tokens_seen": 349900080, + "router_z_loss_mlp": 0.14697266, + "step": 4218, 
+ "time_per_iteration": 2.8252713680267334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070586, + "balance_loss_mlp": 1.05599451, + "epoch": 0.8116583301269719, + "flos": 649951340544.0, + "grad_norm": 0.06544053347412945, + "language_loss": 0.79116189, + "learning_rate": 9.017594607640211e-05, + "loss": 0.80186772, + "num_input_tokens_seen": 349983568, + "router_z_loss_mlp": 0.14562988, + "step": 4219, + "time_per_iteration": 3.0103390216827393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072332, + "balance_loss_mlp": 1.05731189, + "epoch": 0.8118507118122354, + "flos": 553087844352.0, + "grad_norm": 0.06754060213637747, + "language_loss": 0.80264437, + "learning_rate": 8.999755323015463e-05, + "loss": 0.81336772, + "num_input_tokens_seen": 350054928, + "router_z_loss_mlp": 0.14990234, + "step": 4220, + "time_per_iteration": 2.7641568183898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067935, + "balance_loss_mlp": 1.05295074, + "epoch": 0.812043093497499, + "flos": 544118819328.0, + "grad_norm": 0.08257930286833466, + "language_loss": 0.8756063, + "learning_rate": 8.981931956097384e-05, + "loss": 0.88628566, + "num_input_tokens_seen": 350127872, + "router_z_loss_mlp": 0.14953613, + "step": 4221, + "time_per_iteration": 2.646381139755249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107214, + "balance_loss_mlp": 1.05748951, + "epoch": 0.8122354751827626, + "flos": 583404788736.0, + "grad_norm": 0.06076540452447546, + "language_loss": 0.83228678, + "learning_rate": 8.964124513805628e-05, + "loss": 0.84300816, + "num_input_tokens_seen": 350206592, + "router_z_loss_mlp": 0.14624023, + "step": 4222, + "time_per_iteration": 2.7860500812530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011046, + "balance_loss_mlp": 1.00465596, + "epoch": 0.8124278568680262, + "flos": 1530568120320.0, + "grad_norm": 0.007608284192397786, + "language_loss": 0.78250074, + 
"learning_rate": 8.94633300305363e-05, + "loss": 0.79261118, + "num_input_tokens_seen": 350436048, + "router_z_loss_mlp": 0.06396484, + "step": 4223, + "time_per_iteration": 4.9178102016448975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073494, + "balance_loss_mlp": 1.05906975, + "epoch": 0.8126202385532897, + "flos": 432865161216.0, + "grad_norm": 0.07270938351246994, + "language_loss": 0.79917443, + "learning_rate": 8.928557430748668e-05, + "loss": 0.80990934, + "num_input_tokens_seen": 350501376, + "router_z_loss_mlp": 0.14404297, + "step": 4224, + "time_per_iteration": 2.583998680114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010746, + "balance_loss_mlp": 1.00440443, + "epoch": 0.8128126202385533, + "flos": 1547905987584.0, + "grad_norm": 0.00790710761891799, + "language_loss": 0.76495624, + "learning_rate": 8.910797803791854e-05, + "loss": 0.77506375, + "num_input_tokens_seen": 350735232, + "router_z_loss_mlp": 0.06347656, + "step": 4225, + "time_per_iteration": 4.820707321166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070833, + "balance_loss_mlp": 1.05620575, + "epoch": 0.8130050019238169, + "flos": 528317945856.0, + "grad_norm": 0.06548779775423773, + "language_loss": 0.88866699, + "learning_rate": 8.893054129078077e-05, + "loss": 0.89937526, + "num_input_tokens_seen": 350805088, + "router_z_loss_mlp": 0.14624023, + "step": 4226, + "time_per_iteration": 2.636085271835327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071555, + "balance_loss_mlp": 1.05685687, + "epoch": 0.8131973836090804, + "flos": 543125481984.0, + "grad_norm": 0.08255855084993005, + "language_loss": 0.80108345, + "learning_rate": 8.875326413496037e-05, + "loss": 0.81179905, + "num_input_tokens_seen": 350876896, + "router_z_loss_mlp": 0.14685059, + "step": 4227, + "time_per_iteration": 2.8090357780456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107379, + 
"balance_loss_mlp": 1.05867422, + "epoch": 0.8133897652943439, + "flos": 576494019072.0, + "grad_norm": 0.08767577384778778, + "language_loss": 0.82186741, + "learning_rate": 8.857614663928249e-05, + "loss": 0.83260536, + "num_input_tokens_seen": 350948400, + "router_z_loss_mlp": 0.15087891, + "step": 4228, + "time_per_iteration": 2.7402915954589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078602, + "balance_loss_mlp": 1.06378388, + "epoch": 0.8135821469796075, + "flos": 579219268608.0, + "grad_norm": 0.07685929665227552, + "language_loss": 0.78881317, + "learning_rate": 8.839918887251025e-05, + "loss": 0.79959923, + "num_input_tokens_seen": 351023328, + "router_z_loss_mlp": 0.14794922, + "step": 4229, + "time_per_iteration": 2.7446353435516357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076808, + "balance_loss_mlp": 1.06232381, + "epoch": 0.8137745286648711, + "flos": 650346693120.0, + "grad_norm": 0.0759740537833267, + "language_loss": 0.83667004, + "learning_rate": 8.822239090334472e-05, + "loss": 0.8474381, + "num_input_tokens_seen": 351108672, + "router_z_loss_mlp": 0.14465332, + "step": 4230, + "time_per_iteration": 2.9547126293182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107277, + "balance_loss_mlp": 1.05783296, + "epoch": 0.8139669103501347, + "flos": 701888219136.0, + "grad_norm": 0.06626400468200025, + "language_loss": 0.7554509, + "learning_rate": 8.804575280042493e-05, + "loss": 0.76617861, + "num_input_tokens_seen": 351185056, + "router_z_loss_mlp": 0.14929199, + "step": 4231, + "time_per_iteration": 2.974144458770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080812, + "balance_loss_mlp": 1.06619692, + "epoch": 0.8141592920353983, + "flos": 650223355392.0, + "grad_norm": 0.08117031851913392, + "language_loss": 0.82810342, + "learning_rate": 8.786927463232774e-05, + "loss": 0.83891159, + "num_input_tokens_seen": 351255856, + "router_z_loss_mlp": 
0.14587402, + "step": 4232, + "time_per_iteration": 2.828878164291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078383, + "balance_loss_mlp": 1.06356478, + "epoch": 0.8143516737206618, + "flos": 536829949440.0, + "grad_norm": 0.07623472218938802, + "language_loss": 0.81033397, + "learning_rate": 8.769295646756853e-05, + "loss": 0.82111776, + "num_input_tokens_seen": 351322336, + "router_z_loss_mlp": 0.14794922, + "step": 4233, + "time_per_iteration": 2.61362361907959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076653, + "balance_loss_mlp": 1.06164443, + "epoch": 0.8145440554059253, + "flos": 508366056960.0, + "grad_norm": 0.08266771848864475, + "language_loss": 0.82275444, + "learning_rate": 8.751679837459963e-05, + "loss": 0.83352101, + "num_input_tokens_seen": 351387440, + "router_z_loss_mlp": 0.14978027, + "step": 4234, + "time_per_iteration": 2.5858421325683594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071303, + "balance_loss_mlp": 1.05697441, + "epoch": 0.8147364370911889, + "flos": 635032576512.0, + "grad_norm": 0.05785121947375422, + "language_loss": 0.86312371, + "learning_rate": 8.734080042181181e-05, + "loss": 0.87383676, + "num_input_tokens_seen": 351464192, + "router_z_loss_mlp": 0.14318848, + "step": 4235, + "time_per_iteration": 2.841019868850708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072756, + "balance_loss_mlp": 1.05809283, + "epoch": 0.8149288187764525, + "flos": 422801482752.0, + "grad_norm": 0.07694022174465051, + "language_loss": 0.78536922, + "learning_rate": 8.716496267753343e-05, + "loss": 0.7960968, + "num_input_tokens_seen": 351528016, + "router_z_loss_mlp": 0.14660645, + "step": 4236, + "time_per_iteration": 2.4641432762145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107039, + "balance_loss_mlp": 1.055632, + "epoch": 0.8151212004617161, + "flos": 597444014592.0, + "grad_norm": 0.07150966295546053, + "language_loss": 
0.81421232, + "learning_rate": 8.698928521003097e-05, + "loss": 0.82491624, + "num_input_tokens_seen": 351601648, + "router_z_loss_mlp": 0.14733887, + "step": 4237, + "time_per_iteration": 2.7782487869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011063, + "balance_loss_mlp": 1.00476873, + "epoch": 0.8153135821469796, + "flos": 1479330915840.0, + "grad_norm": 0.006323287635293764, + "language_loss": 0.77852845, + "learning_rate": 8.681376808750835e-05, + "loss": 0.78863907, + "num_input_tokens_seen": 351826720, + "router_z_loss_mlp": 0.06298828, + "step": 4238, + "time_per_iteration": 4.983094930648804 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067411, + "balance_loss_mlp": 1.0524863, + "epoch": 0.8155059638322432, + "flos": 437097669120.0, + "grad_norm": 0.10825435127351188, + "language_loss": 0.82812446, + "learning_rate": 8.663841137810741e-05, + "loss": 0.83879864, + "num_input_tokens_seen": 351891760, + "router_z_loss_mlp": 0.14916992, + "step": 4239, + "time_per_iteration": 2.5248992443084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107273, + "balance_loss_mlp": 1.05785286, + "epoch": 0.8156983455175068, + "flos": 794390727168.0, + "grad_norm": 0.07546845306981396, + "language_loss": 0.85244554, + "learning_rate": 8.646321514990763e-05, + "loss": 0.86317283, + "num_input_tokens_seen": 351977504, + "router_z_loss_mlp": 0.14855957, + "step": 4240, + "time_per_iteration": 3.0435335636138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069391, + "balance_loss_mlp": 1.05460918, + "epoch": 0.8158907272027703, + "flos": 685986029568.0, + "grad_norm": 0.09379307453363464, + "language_loss": 0.81874454, + "learning_rate": 8.628817947092616e-05, + "loss": 0.82943839, + "num_input_tokens_seen": 352050176, + "router_z_loss_mlp": 0.14758301, + "step": 4241, + "time_per_iteration": 2.8032925128936768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069769, + 
"balance_loss_mlp": 1.05539286, + "epoch": 0.8160831088880338, + "flos": 487055213568.0, + "grad_norm": 0.08597604805020649, + "language_loss": 0.84047925, + "learning_rate": 8.611330440911797e-05, + "loss": 0.85117698, + "num_input_tokens_seen": 352116848, + "router_z_loss_mlp": 0.14367676, + "step": 4242, + "time_per_iteration": 2.5600948333740234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063929, + "balance_loss_mlp": 1.04928982, + "epoch": 0.8162754905732974, + "flos": 464872172544.0, + "grad_norm": 0.06850617145675146, + "language_loss": 0.80506492, + "learning_rate": 8.593859003237558e-05, + "loss": 0.81570411, + "num_input_tokens_seen": 352185056, + "router_z_loss_mlp": 0.14611816, + "step": 4243, + "time_per_iteration": 2.554950475692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01012818, + "balance_loss_mlp": 1.00652385, + "epoch": 0.816467872258561, + "flos": 1239530522112.0, + "grad_norm": 0.007477552534397375, + "language_loss": 0.75285125, + "learning_rate": 8.576403640852904e-05, + "loss": 0.76297939, + "num_input_tokens_seen": 352397648, + "router_z_loss_mlp": 0.06298828, + "step": 4244, + "time_per_iteration": 4.717959880828857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063982, + "balance_loss_mlp": 1.04943848, + "epoch": 0.8166602539438246, + "flos": 687169516032.0, + "grad_norm": 0.058268983296576836, + "language_loss": 0.86534429, + "learning_rate": 8.558964360534615e-05, + "loss": 0.87598407, + "num_input_tokens_seen": 352478272, + "router_z_loss_mlp": 0.14526367, + "step": 4245, + "time_per_iteration": 2.9267804622650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011178, + "balance_loss_mlp": 1.00488424, + "epoch": 0.8168526356290882, + "flos": 1490520807936.0, + "grad_norm": 0.00711959110755669, + "language_loss": 0.72974741, + "learning_rate": 8.541541169053219e-05, + "loss": 0.73985922, + "num_input_tokens_seen": 352707104, + "router_z_loss_mlp": 
0.06298828, + "step": 4246, + "time_per_iteration": 4.947716951370239 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066889, + "balance_loss_mlp": 1.05230975, + "epoch": 0.8170450173143516, + "flos": 578201338368.0, + "grad_norm": 0.07220804796872216, + "language_loss": 0.8435545, + "learning_rate": 8.524134073172984e-05, + "loss": 0.85422337, + "num_input_tokens_seen": 352779248, + "router_z_loss_mlp": 0.14575195, + "step": 4247, + "time_per_iteration": 2.71348237991333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070044, + "balance_loss_mlp": 1.0554409, + "epoch": 0.8172373989996152, + "flos": 571275514368.0, + "grad_norm": 0.06401767096743954, + "language_loss": 0.84267342, + "learning_rate": 8.506743079651974e-05, + "loss": 0.85337389, + "num_input_tokens_seen": 352856784, + "router_z_loss_mlp": 0.14599609, + "step": 4248, + "time_per_iteration": 2.7759175300598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064518, + "balance_loss_mlp": 1.04941392, + "epoch": 0.8174297806848788, + "flos": 528831866880.0, + "grad_norm": 0.08441006927366383, + "language_loss": 0.81059384, + "learning_rate": 8.489368195241948e-05, + "loss": 0.82123899, + "num_input_tokens_seen": 352926496, + "router_z_loss_mlp": 0.15075684, + "step": 4249, + "time_per_iteration": 2.687244176864624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106512, + "balance_loss_mlp": 1.05042124, + "epoch": 0.8176221623701424, + "flos": 569108602368.0, + "grad_norm": 0.06785328492638941, + "language_loss": 0.78955877, + "learning_rate": 8.47200942668846e-05, + "loss": 0.80021, + "num_input_tokens_seen": 353005312, + "router_z_loss_mlp": 0.14697266, + "step": 4250, + "time_per_iteration": 2.829185724258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066489, + "balance_loss_mlp": 1.05181456, + "epoch": 0.8178145440554059, + "flos": 656521459200.0, + "grad_norm": 0.07524856848543239, + "language_loss": 
0.80325913, + "learning_rate": 8.454666780730735e-05, + "loss": 0.81392401, + "num_input_tokens_seen": 353085120, + "router_z_loss_mlp": 0.14660645, + "step": 4251, + "time_per_iteration": 2.8891162872314453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067615, + "balance_loss_mlp": 1.05297589, + "epoch": 0.8180069257406695, + "flos": 545924883456.0, + "grad_norm": 0.07928844694004242, + "language_loss": 0.87725914, + "learning_rate": 8.437340264101828e-05, + "loss": 0.88793522, + "num_input_tokens_seen": 353160992, + "router_z_loss_mlp": 0.14611816, + "step": 4252, + "time_per_iteration": 2.7597384452819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067221, + "balance_loss_mlp": 1.0522964, + "epoch": 0.818199307425933, + "flos": 619271350272.0, + "grad_norm": 0.08227515131076636, + "language_loss": 0.84713292, + "learning_rate": 8.420029883528474e-05, + "loss": 0.85780513, + "num_input_tokens_seen": 353233328, + "router_z_loss_mlp": 0.14904785, + "step": 4253, + "time_per_iteration": 2.727544069290161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068369, + "balance_loss_mlp": 1.05302691, + "epoch": 0.8183916891111966, + "flos": 647618872320.0, + "grad_norm": 0.08297238851209021, + "language_loss": 0.76718354, + "learning_rate": 8.402735645731157e-05, + "loss": 0.77786726, + "num_input_tokens_seen": 353310592, + "router_z_loss_mlp": 0.15319824, + "step": 4254, + "time_per_iteration": 2.9058609008789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066395, + "balance_loss_mlp": 1.05169678, + "epoch": 0.8185840707964602, + "flos": 499120247808.0, + "grad_norm": 0.07214603685273151, + "language_loss": 0.77970219, + "learning_rate": 8.385457557424098e-05, + "loss": 0.79036617, + "num_input_tokens_seen": 353376544, + "router_z_loss_mlp": 0.14685059, + "step": 4255, + "time_per_iteration": 2.618168830871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106315, + 
"balance_loss_mlp": 1.04854703, + "epoch": 0.8187764524817237, + "flos": 786229659648.0, + "grad_norm": 0.06559935606493841, + "language_loss": 0.79293621, + "learning_rate": 8.368195625315251e-05, + "loss": 0.80356765, + "num_input_tokens_seen": 353461200, + "router_z_loss_mlp": 0.14599609, + "step": 4256, + "time_per_iteration": 3.2203421592712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062059, + "balance_loss_mlp": 1.04709792, + "epoch": 0.8189688341669873, + "flos": 550710959616.0, + "grad_norm": 0.05824841247064268, + "language_loss": 0.80574787, + "learning_rate": 8.350949856106283e-05, + "loss": 0.81636846, + "num_input_tokens_seen": 353538608, + "router_z_loss_mlp": 0.14929199, + "step": 4257, + "time_per_iteration": 2.925502300262451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008141, + "balance_loss_mlp": 1.00179935, + "epoch": 0.8191612158522509, + "flos": 1351972435968.0, + "grad_norm": 0.005216238757074485, + "language_loss": 0.71149343, + "learning_rate": 8.333720256492599e-05, + "loss": 0.72157484, + "num_input_tokens_seen": 353766960, + "router_z_loss_mlp": 0.06347656, + "step": 4258, + "time_per_iteration": 4.837713241577148 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060434, + "balance_loss_mlp": 1.04580665, + "epoch": 0.8193535975375145, + "flos": 544257211392.0, + "grad_norm": 0.08204220791726961, + "language_loss": 0.83521521, + "learning_rate": 8.316506833163318e-05, + "loss": 0.84581947, + "num_input_tokens_seen": 353833552, + "router_z_loss_mlp": 0.14599609, + "step": 4259, + "time_per_iteration": 2.687384605407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067951, + "balance_loss_mlp": 1.05326462, + "epoch": 0.8195459792227779, + "flos": 865733266944.0, + "grad_norm": 0.057213289118123956, + "language_loss": 0.85745478, + "learning_rate": 8.299309592801297e-05, + "loss": 0.86813432, + "num_input_tokens_seen": 353915520, + "router_z_loss_mlp": 
0.14660645, + "step": 4260, + "time_per_iteration": 3.1039042472839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066565, + "balance_loss_mlp": 1.05187869, + "epoch": 0.8197383609080415, + "flos": 569293982208.0, + "grad_norm": 0.08308709136286152, + "language_loss": 0.81558263, + "learning_rate": 8.282128542083101e-05, + "loss": 0.82624829, + "num_input_tokens_seen": 353992048, + "router_z_loss_mlp": 0.14672852, + "step": 4261, + "time_per_iteration": 2.7021541595458984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060256, + "balance_loss_mlp": 1.04535532, + "epoch": 0.8199307425933051, + "flos": 530813399040.0, + "grad_norm": 0.06915941438487261, + "language_loss": 0.85103023, + "learning_rate": 8.264963687678978e-05, + "loss": 0.86163288, + "num_input_tokens_seen": 354064848, + "router_z_loss_mlp": 0.14892578, + "step": 4262, + "time_per_iteration": 2.6805107593536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067164, + "balance_loss_mlp": 1.05225098, + "epoch": 0.8201231242785687, + "flos": 567070170624.0, + "grad_norm": 0.06623199585661957, + "language_loss": 0.84908283, + "learning_rate": 8.247815036252921e-05, + "loss": 0.85975444, + "num_input_tokens_seen": 354138848, + "router_z_loss_mlp": 0.14904785, + "step": 4263, + "time_per_iteration": 2.799445629119873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064375, + "balance_loss_mlp": 1.05036759, + "epoch": 0.8203155059638323, + "flos": 1230505717248.0, + "grad_norm": 0.06807936964108087, + "language_loss": 0.82982183, + "learning_rate": 8.230682594462652e-05, + "loss": 0.84046555, + "num_input_tokens_seen": 354227696, + "router_z_loss_mlp": 0.14038086, + "step": 4264, + "time_per_iteration": 3.5467734336853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066376, + "balance_loss_mlp": 1.05158246, + "epoch": 0.8205078876490958, + "flos": 574198626816.0, + "grad_norm": 0.06878665098349063, + 
"language_loss": 0.79854757, + "learning_rate": 8.213566368959558e-05, + "loss": 0.80921131, + "num_input_tokens_seen": 354298400, + "router_z_loss_mlp": 0.14770508, + "step": 4265, + "time_per_iteration": 2.6667027473449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068665, + "balance_loss_mlp": 1.05419254, + "epoch": 0.8207002693343594, + "flos": 931400280576.0, + "grad_norm": 0.07205474863641972, + "language_loss": 0.77937365, + "learning_rate": 8.196466366388744e-05, + "loss": 0.79006028, + "num_input_tokens_seen": 354385024, + "router_z_loss_mlp": 0.14465332, + "step": 4266, + "time_per_iteration": 3.2075653076171875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068905, + "balance_loss_mlp": 1.05440879, + "epoch": 0.8208926510196229, + "flos": 549571889664.0, + "grad_norm": 0.06576276749924337, + "language_loss": 0.80342031, + "learning_rate": 8.179382593389029e-05, + "loss": 0.81410939, + "num_input_tokens_seen": 354456384, + "router_z_loss_mlp": 0.14501953, + "step": 4267, + "time_per_iteration": 2.6763927936553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070444, + "balance_loss_mlp": 1.0557574, + "epoch": 0.8210850327048865, + "flos": 648182352384.0, + "grad_norm": 0.058242671998823256, + "language_loss": 0.8210336, + "learning_rate": 8.162315056592918e-05, + "loss": 0.83173811, + "num_input_tokens_seen": 354531296, + "router_z_loss_mlp": 0.14672852, + "step": 4268, + "time_per_iteration": 2.861537456512451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066951, + "balance_loss_mlp": 1.05231237, + "epoch": 0.82127741439015, + "flos": 601520878080.0, + "grad_norm": 0.09144521431172725, + "language_loss": 0.81410992, + "learning_rate": 8.145263762626615e-05, + "loss": 0.82477945, + "num_input_tokens_seen": 354605680, + "router_z_loss_mlp": 0.14611816, + "step": 4269, + "time_per_iteration": 2.832193613052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01064735, + "balance_loss_mlp": 1.05038261, + "epoch": 0.8214697960754136, + "flos": 474831963648.0, + "grad_norm": 0.07129897215411395, + "language_loss": 0.83495176, + "learning_rate": 8.128228718110015e-05, + "loss": 0.84559911, + "num_input_tokens_seen": 354678160, + "router_z_loss_mlp": 0.14355469, + "step": 4270, + "time_per_iteration": 2.7007412910461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070918, + "balance_loss_mlp": 1.05664861, + "epoch": 0.8216621777606772, + "flos": 903648172032.0, + "grad_norm": 0.10714973214650605, + "language_loss": 0.84790981, + "learning_rate": 8.11120992965671e-05, + "loss": 0.85861897, + "num_input_tokens_seen": 354751024, + "router_z_loss_mlp": 0.1427002, + "step": 4271, + "time_per_iteration": 3.086967945098877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070644, + "balance_loss_mlp": 1.05581439, + "epoch": 0.8218545594459408, + "flos": 514461528576.0, + "grad_norm": 0.09850812863840513, + "language_loss": 0.82123983, + "learning_rate": 8.094207403873998e-05, + "loss": 0.83194625, + "num_input_tokens_seen": 354819408, + "router_z_loss_mlp": 0.14819336, + "step": 4272, + "time_per_iteration": 2.6546895503997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066366, + "balance_loss_mlp": 1.05183411, + "epoch": 0.8220469411312044, + "flos": 494536803840.0, + "grad_norm": 0.08038507923953937, + "language_loss": 0.86432809, + "learning_rate": 8.077221147362829e-05, + "loss": 0.87499177, + "num_input_tokens_seen": 354887376, + "router_z_loss_mlp": 0.14526367, + "step": 4273, + "time_per_iteration": 2.6141135692596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106927, + "balance_loss_mlp": 1.05478621, + "epoch": 0.8222393228164678, + "flos": 386433483264.0, + "grad_norm": 0.09203044891172038, + "language_loss": 0.8956039, + "learning_rate": 8.060251166717835e-05, + "loss": 0.90629661, + "num_input_tokens_seen": 354948288, + 
"router_z_loss_mlp": 0.14477539, + "step": 4274, + "time_per_iteration": 2.462885618209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070861, + "balance_loss_mlp": 1.05579329, + "epoch": 0.8224317045017314, + "flos": 536590241280.0, + "grad_norm": 0.062175194099720756, + "language_loss": 0.86843693, + "learning_rate": 8.043297468527383e-05, + "loss": 0.8791455, + "num_input_tokens_seen": 355016912, + "router_z_loss_mlp": 0.1505127, + "step": 4275, + "time_per_iteration": 2.687908172607422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066529, + "balance_loss_mlp": 1.05210471, + "epoch": 0.822624086186995, + "flos": 554899051008.0, + "grad_norm": 0.07402291421555263, + "language_loss": 0.82578254, + "learning_rate": 8.02636005937346e-05, + "loss": 0.83644789, + "num_input_tokens_seen": 355085936, + "router_z_loss_mlp": 0.14416504, + "step": 4276, + "time_per_iteration": 2.653111696243286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063808, + "balance_loss_mlp": 1.04871583, + "epoch": 0.8228164678722586, + "flos": 539579791872.0, + "grad_norm": 0.06446226199945072, + "language_loss": 0.79937363, + "learning_rate": 8.009438945831771e-05, + "loss": 0.81001174, + "num_input_tokens_seen": 355161984, + "router_z_loss_mlp": 0.15075684, + "step": 4277, + "time_per_iteration": 2.774325132369995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067009, + "balance_loss_mlp": 1.05238247, + "epoch": 0.8230088495575221, + "flos": 473253124608.0, + "grad_norm": 0.06473508268718137, + "language_loss": 0.79103273, + "learning_rate": 7.992534134471641e-05, + "loss": 0.80170286, + "num_input_tokens_seen": 355234544, + "router_z_loss_mlp": 0.14599609, + "step": 4278, + "time_per_iteration": 2.653775930404663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066815, + "balance_loss_mlp": 1.0519377, + "epoch": 0.8232012312427857, + "flos": 591672314880.0, + "grad_norm": 
0.09210301400627263, + "language_loss": 0.82811761, + "learning_rate": 7.975645631856127e-05, + "loss": 0.83878583, + "num_input_tokens_seen": 355302896, + "router_z_loss_mlp": 0.14855957, + "step": 4279, + "time_per_iteration": 2.6823325157165527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065231, + "balance_loss_mlp": 1.05017543, + "epoch": 0.8233936129280492, + "flos": 572644380672.0, + "grad_norm": 0.06658303905953458, + "language_loss": 0.74463868, + "learning_rate": 7.958773444541916e-05, + "loss": 0.75529099, + "num_input_tokens_seen": 355377040, + "router_z_loss_mlp": 0.15026855, + "step": 4280, + "time_per_iteration": 2.7801764011383057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066907, + "balance_loss_mlp": 1.05257797, + "epoch": 0.8235859946133128, + "flos": 731337735168.0, + "grad_norm": 0.0690373463225978, + "language_loss": 0.78400791, + "learning_rate": 7.941917579079383e-05, + "loss": 0.79467702, + "num_input_tokens_seen": 355461616, + "router_z_loss_mlp": 0.14343262, + "step": 4281, + "time_per_iteration": 3.060039520263672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066701, + "balance_loss_mlp": 1.05222869, + "epoch": 0.8237783762985764, + "flos": 570314483712.0, + "grad_norm": 0.07396215157678351, + "language_loss": 0.81395936, + "learning_rate": 7.92507804201253e-05, + "loss": 0.82462645, + "num_input_tokens_seen": 355532480, + "router_z_loss_mlp": 0.14453125, + "step": 4282, + "time_per_iteration": 2.808473587036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007879, + "balance_loss_mlp": 1.00144207, + "epoch": 0.8239707579838399, + "flos": 1466232897024.0, + "grad_norm": 0.005158334115964225, + "language_loss": 0.75297678, + "learning_rate": 7.908254839879092e-05, + "loss": 0.76305556, + "num_input_tokens_seen": 355768752, + "router_z_loss_mlp": 0.06445312, + "step": 4283, + "time_per_iteration": 4.95106315612793 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01065977, + "balance_loss_mlp": 1.05111217, + "epoch": 0.8241631396691035, + "flos": 467313297408.0, + "grad_norm": 0.07378682301841104, + "language_loss": 0.80314898, + "learning_rate": 7.89144797921037e-05, + "loss": 0.81380886, + "num_input_tokens_seen": 355838800, + "router_z_loss_mlp": 0.14855957, + "step": 4284, + "time_per_iteration": 2.7099735736846924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007982, + "balance_loss_mlp": 1.00154495, + "epoch": 0.8243555213543671, + "flos": 1539426290688.0, + "grad_norm": 0.005169205601206867, + "language_loss": 0.77934271, + "learning_rate": 7.874657466531388e-05, + "loss": 0.78942251, + "num_input_tokens_seen": 356069280, + "router_z_loss_mlp": 0.06445312, + "step": 4285, + "time_per_iteration": 4.925944089889526 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065516, + "balance_loss_mlp": 1.05109131, + "epoch": 0.8245479030396307, + "flos": 797429836800.0, + "grad_norm": 0.07826077018857239, + "language_loss": 0.82661068, + "learning_rate": 7.85788330836078e-05, + "loss": 0.83726579, + "num_input_tokens_seen": 356164528, + "router_z_loss_mlp": 0.14428711, + "step": 4286, + "time_per_iteration": 3.1125218868255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068802, + "balance_loss_mlp": 1.05397248, + "epoch": 0.8247402847248941, + "flos": 646114185216.0, + "grad_norm": 0.06888456798344761, + "language_loss": 0.76415771, + "learning_rate": 7.841125511210878e-05, + "loss": 0.77484572, + "num_input_tokens_seen": 356243600, + "router_z_loss_mlp": 0.14807129, + "step": 4287, + "time_per_iteration": 2.893860101699829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068506, + "balance_loss_mlp": 1.05348611, + "epoch": 0.8249326664101577, + "flos": 604421595648.0, + "grad_norm": 0.06182947496579068, + "language_loss": 0.79757684, + "learning_rate": 7.824384081587637e-05, + "loss": 0.80826187, + "num_input_tokens_seen": 
356320320, + "router_z_loss_mlp": 0.15002441, + "step": 4288, + "time_per_iteration": 2.8073134422302246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010702, + "balance_loss_mlp": 1.0554297, + "epoch": 0.8251250480954213, + "flos": 824369218560.0, + "grad_norm": 0.08909700338283992, + "language_loss": 0.86458504, + "learning_rate": 7.807659025990637e-05, + "loss": 0.87528706, + "num_input_tokens_seen": 356406928, + "router_z_loss_mlp": 0.14746094, + "step": 4289, + "time_per_iteration": 3.1265206336975098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_mlp": 1.04926586, + "epoch": 0.8253174297806849, + "flos": 757382897664.0, + "grad_norm": 0.07722312051706566, + "language_loss": 0.78082144, + "learning_rate": 7.790950350913112e-05, + "loss": 0.7914592, + "num_input_tokens_seen": 356481456, + "router_z_loss_mlp": 0.1451416, + "step": 4290, + "time_per_iteration": 2.919142246246338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070465, + "balance_loss_mlp": 1.05595672, + "epoch": 0.8255098114659485, + "flos": 794469648384.0, + "grad_norm": 0.06822496505203762, + "language_loss": 0.87448025, + "learning_rate": 7.774258062841971e-05, + "loss": 0.88518488, + "num_input_tokens_seen": 356568736, + "router_z_loss_mlp": 0.14489746, + "step": 4291, + "time_per_iteration": 3.1968111991882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063883, + "balance_loss_mlp": 1.04919672, + "epoch": 0.825702193151212, + "flos": 710417475072.0, + "grad_norm": 0.11140980724884261, + "language_loss": 0.77158391, + "learning_rate": 7.757582168257731e-05, + "loss": 0.78222275, + "num_input_tokens_seen": 356643328, + "router_z_loss_mlp": 0.14672852, + "step": 4292, + "time_per_iteration": 2.864590883255005 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064183, + "balance_loss_mlp": 1.04980659, + "epoch": 0.8258945748364755, + "flos": 683394029568.0, + "grad_norm": 
0.07528844179366555, + "language_loss": 0.80776614, + "learning_rate": 7.740922673634537e-05, + "loss": 0.81840801, + "num_input_tokens_seen": 356723824, + "router_z_loss_mlp": 0.14379883, + "step": 4293, + "time_per_iteration": 2.9787964820861816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069315, + "balance_loss_mlp": 1.05462837, + "epoch": 0.8260869565217391, + "flos": 594563120640.0, + "grad_norm": 0.07232173047564831, + "language_loss": 0.78854036, + "learning_rate": 7.724279585440186e-05, + "loss": 0.79923344, + "num_input_tokens_seen": 356796512, + "router_z_loss_mlp": 0.14660645, + "step": 4294, + "time_per_iteration": 2.737032175064087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063563, + "balance_loss_mlp": 1.04878104, + "epoch": 0.8262793382070027, + "flos": 651480993792.0, + "grad_norm": 0.11834626543573872, + "language_loss": 0.85043526, + "learning_rate": 7.707652910136098e-05, + "loss": 0.86107087, + "num_input_tokens_seen": 356868624, + "router_z_loss_mlp": 0.14758301, + "step": 4295, + "time_per_iteration": 2.7672622203826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067425, + "balance_loss_mlp": 1.05227339, + "epoch": 0.8264717198922663, + "flos": 538922709504.0, + "grad_norm": 0.07320373612786368, + "language_loss": 0.85068297, + "learning_rate": 7.691042654177315e-05, + "loss": 0.86135721, + "num_input_tokens_seen": 356934368, + "router_z_loss_mlp": 0.15136719, + "step": 4296, + "time_per_iteration": 2.727430820465088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067722, + "balance_loss_mlp": 1.05339313, + "epoch": 0.8266641015775298, + "flos": 538949873664.0, + "grad_norm": 0.08277618732225704, + "language_loss": 0.75727075, + "learning_rate": 7.674448824012514e-05, + "loss": 0.76794797, + "num_input_tokens_seen": 357005536, + "router_z_loss_mlp": 0.14331055, + "step": 4297, + "time_per_iteration": 2.6567587852478027 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01068847, + "balance_loss_mlp": 1.0540297, + "epoch": 0.8268564832627934, + "flos": 585361728000.0, + "grad_norm": 0.0640063597091925, + "language_loss": 0.83917528, + "learning_rate": 7.657871426083979e-05, + "loss": 0.84986377, + "num_input_tokens_seen": 357082160, + "router_z_loss_mlp": 0.14794922, + "step": 4298, + "time_per_iteration": 2.7982728481292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_mlp": 1.0529747, + "epoch": 0.827048864948057, + "flos": 430661173248.0, + "grad_norm": 0.0794186132350224, + "language_loss": 0.84216493, + "learning_rate": 7.641310466827667e-05, + "loss": 0.85284323, + "num_input_tokens_seen": 357146928, + "router_z_loss_mlp": 0.14831543, + "step": 4299, + "time_per_iteration": 2.479512929916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062096, + "balance_loss_mlp": 1.04768384, + "epoch": 0.8272412466333205, + "flos": 1388430761472.0, + "grad_norm": 0.07007017286970613, + "language_loss": 0.84912431, + "learning_rate": 7.624765952673069e-05, + "loss": 0.85974526, + "num_input_tokens_seen": 357236768, + "router_z_loss_mlp": 0.14379883, + "step": 4300, + "time_per_iteration": 3.7502307891845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065217, + "balance_loss_mlp": 1.05036318, + "epoch": 0.827433628318584, + "flos": 538230749184.0, + "grad_norm": 0.07093314756635549, + "language_loss": 0.82853031, + "learning_rate": 7.608237890043335e-05, + "loss": 0.8391825, + "num_input_tokens_seen": 357307568, + "router_z_loss_mlp": 0.1484375, + "step": 4301, + "time_per_iteration": 2.697632312774658 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061301, + "balance_loss_mlp": 1.04642415, + "epoch": 0.8276260100038476, + "flos": 730734981120.0, + "grad_norm": 0.067461781222512, + "language_loss": 0.77062577, + "learning_rate": 7.59172628535526e-05, + "loss": 0.78123879, + "num_input_tokens_seen": 
357387712, + "router_z_loss_mlp": 0.14855957, + "step": 4302, + "time_per_iteration": 2.9730281829833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069537, + "balance_loss_mlp": 1.05501771, + "epoch": 0.8278183916891112, + "flos": 871102273536.0, + "grad_norm": 0.0590298560947334, + "language_loss": 0.82669955, + "learning_rate": 7.575231145019196e-05, + "loss": 0.83739495, + "num_input_tokens_seen": 357473360, + "router_z_loss_mlp": 0.14501953, + "step": 4303, + "time_per_iteration": 3.175729274749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066264, + "balance_loss_mlp": 1.05120802, + "epoch": 0.8280107733743748, + "flos": 594543297024.0, + "grad_norm": 0.05865024398378704, + "language_loss": 0.77674329, + "learning_rate": 7.558752475439134e-05, + "loss": 0.78740591, + "num_input_tokens_seen": 357548432, + "router_z_loss_mlp": 0.15039062, + "step": 4304, + "time_per_iteration": 2.7806692123413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071171, + "balance_loss_mlp": 1.05629373, + "epoch": 0.8282031550596384, + "flos": 768607667712.0, + "grad_norm": 0.06803152802988026, + "language_loss": 0.84490967, + "learning_rate": 7.542290283012653e-05, + "loss": 0.8556214, + "num_input_tokens_seen": 357625968, + "router_z_loss_mlp": 0.14868164, + "step": 4305, + "time_per_iteration": 3.042027711868286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065414, + "balance_loss_mlp": 1.05056047, + "epoch": 0.8283955367449019, + "flos": 696108805632.0, + "grad_norm": 0.07027931411926491, + "language_loss": 0.77876532, + "learning_rate": 7.525844574130947e-05, + "loss": 0.78941941, + "num_input_tokens_seen": 357705824, + "router_z_loss_mlp": 0.14831543, + "step": 4306, + "time_per_iteration": 3.0258917808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067598, + "balance_loss_mlp": 1.0529592, + "epoch": 0.8285879184301654, + "flos": 660630256128.0, + "grad_norm": 
0.06295078199718337, + "language_loss": 0.82822084, + "learning_rate": 7.509415355178806e-05, + "loss": 0.83889681, + "num_input_tokens_seen": 357787040, + "router_z_loss_mlp": 0.14611816, + "step": 4307, + "time_per_iteration": 2.9383127689361572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067189, + "balance_loss_mlp": 1.05246627, + "epoch": 0.828780300115429, + "flos": 558709042176.0, + "grad_norm": 0.08802993540008418, + "language_loss": 0.77530718, + "learning_rate": 7.493002632534618e-05, + "loss": 0.78597909, + "num_input_tokens_seen": 357856960, + "router_z_loss_mlp": 0.14709473, + "step": 4308, + "time_per_iteration": 2.690993547439575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066796, + "balance_loss_mlp": 1.05171561, + "epoch": 0.8289726818006926, + "flos": 830963930112.0, + "grad_norm": 0.07318550442475504, + "language_loss": 0.82053602, + "learning_rate": 7.476606412570352e-05, + "loss": 0.83120394, + "num_input_tokens_seen": 357937760, + "router_z_loss_mlp": 0.15063477, + "step": 4309, + "time_per_iteration": 3.061457872390747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068785, + "balance_loss_mlp": 1.05449212, + "epoch": 0.8291650634859561, + "flos": 732289227264.0, + "grad_norm": 0.10021622774197819, + "language_loss": 0.80771077, + "learning_rate": 7.460226701651624e-05, + "loss": 0.81839859, + "num_input_tokens_seen": 358012480, + "router_z_loss_mlp": 0.1427002, + "step": 4310, + "time_per_iteration": 2.9217689037323 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069318, + "balance_loss_mlp": 1.0544883, + "epoch": 0.8293574451712197, + "flos": 860910114816.0, + "grad_norm": 0.07749506282182811, + "language_loss": 0.8143084, + "learning_rate": 7.443863506137566e-05, + "loss": 0.82500154, + "num_input_tokens_seen": 358100720, + "router_z_loss_mlp": 0.14807129, + "step": 4311, + "time_per_iteration": 3.2195286750793457 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0106071, + "balance_loss_mlp": 1.04607165, + "epoch": 0.8295498268564833, + "flos": 495156810240.0, + "grad_norm": 0.055714920992617885, + "language_loss": 0.81537104, + "learning_rate": 7.427516832380948e-05, + "loss": 0.8259781, + "num_input_tokens_seen": 358180496, + "router_z_loss_mlp": 0.14611816, + "step": 4312, + "time_per_iteration": 2.894439220428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070264, + "balance_loss_mlp": 1.05601811, + "epoch": 0.8297422085417469, + "flos": 554471391744.0, + "grad_norm": 0.06262478668438266, + "language_loss": 0.77979529, + "learning_rate": 7.4111866867281e-05, + "loss": 0.7904979, + "num_input_tokens_seen": 358261104, + "router_z_loss_mlp": 0.14233398, + "step": 4313, + "time_per_iteration": 2.79099440574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106636, + "balance_loss_mlp": 1.0517329, + "epoch": 0.8299345902270104, + "flos": 1247497417728.0, + "grad_norm": 0.07618762117958246, + "language_loss": 0.77313519, + "learning_rate": 7.39487307551896e-05, + "loss": 0.78379875, + "num_input_tokens_seen": 358356368, + "router_z_loss_mlp": 0.14624023, + "step": 4314, + "time_per_iteration": 3.6615333557128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072423, + "balance_loss_mlp": 1.05779576, + "epoch": 0.8301269719122739, + "flos": 585260411904.0, + "grad_norm": 0.18397477745179813, + "language_loss": 0.82993805, + "learning_rate": 7.378576005087034e-05, + "loss": 0.8406623, + "num_input_tokens_seen": 358429104, + "router_z_loss_mlp": 0.14611816, + "step": 4315, + "time_per_iteration": 2.8126580715179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072538, + "balance_loss_mlp": 1.05780363, + "epoch": 0.8303193535975375, + "flos": 509732352000.0, + "grad_norm": 0.08855740032604588, + "language_loss": 0.84620678, + "learning_rate": 7.362295481759412e-05, + "loss": 0.8569321, + "num_input_tokens_seen": 358501344, 
+ "router_z_loss_mlp": 0.1472168, + "step": 4316, + "time_per_iteration": 2.6704373359680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071184, + "balance_loss_mlp": 1.05666399, + "epoch": 0.8305117352828011, + "flos": 580652375040.0, + "grad_norm": 0.0829439330873515, + "language_loss": 0.8352679, + "learning_rate": 7.346031511856722e-05, + "loss": 0.84597969, + "num_input_tokens_seen": 358575584, + "router_z_loss_mlp": 0.14526367, + "step": 4317, + "time_per_iteration": 2.697376012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068221, + "balance_loss_mlp": 1.05336761, + "epoch": 0.8307041169680647, + "flos": 481626736128.0, + "grad_norm": 0.07562403040012457, + "language_loss": 0.78876424, + "learning_rate": 7.329784101693232e-05, + "loss": 0.7994464, + "num_input_tokens_seen": 358644304, + "router_z_loss_mlp": 0.14831543, + "step": 4318, + "time_per_iteration": 2.674924373626709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072061, + "balance_loss_mlp": 1.05757725, + "epoch": 0.8308964986533282, + "flos": 624605852160.0, + "grad_norm": 0.17247227675142032, + "language_loss": 0.82843518, + "learning_rate": 7.313553257576727e-05, + "loss": 0.83915579, + "num_input_tokens_seen": 358712384, + "router_z_loss_mlp": 0.14465332, + "step": 4319, + "time_per_iteration": 2.780308723449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107238, + "balance_loss_mlp": 1.0574789, + "epoch": 0.8310888803385917, + "flos": 827319495168.0, + "grad_norm": 0.07679767527195203, + "language_loss": 0.78869575, + "learning_rate": 7.297338985808589e-05, + "loss": 0.79941958, + "num_input_tokens_seen": 358789264, + "router_z_loss_mlp": 0.14880371, + "step": 4320, + "time_per_iteration": 3.001223087310791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.06415725, + "epoch": 0.8312812620238553, + "flos": 583743241728.0, + "grad_norm": 
0.07050095475557064, + "language_loss": 0.8173933, + "learning_rate": 7.281141292683746e-05, + "loss": 0.8281799, + "num_input_tokens_seen": 358868976, + "router_z_loss_mlp": 0.14501953, + "step": 4321, + "time_per_iteration": 2.8004937171936035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077036, + "balance_loss_mlp": 1.06259966, + "epoch": 0.8314736437091189, + "flos": 1115605052928.0, + "grad_norm": 0.07881560078697845, + "language_loss": 0.74389625, + "learning_rate": 7.26496018449071e-05, + "loss": 0.75466657, + "num_input_tokens_seen": 358953600, + "router_z_loss_mlp": 0.14428711, + "step": 4322, + "time_per_iteration": 3.427699565887451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075197, + "balance_loss_mlp": 1.06103539, + "epoch": 0.8316660253943825, + "flos": 517547625984.0, + "grad_norm": 0.07598389174722883, + "language_loss": 0.81921697, + "learning_rate": 7.248795667511543e-05, + "loss": 0.82996899, + "num_input_tokens_seen": 359028768, + "router_z_loss_mlp": 0.14172363, + "step": 4323, + "time_per_iteration": 2.7954294681549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076668, + "balance_loss_mlp": 1.06266105, + "epoch": 0.831858407079646, + "flos": 795329736192.0, + "grad_norm": 0.09151709224743419, + "language_loss": 0.7770648, + "learning_rate": 7.232647748021864e-05, + "loss": 0.78783149, + "num_input_tokens_seen": 359116208, + "router_z_loss_mlp": 0.14025879, + "step": 4324, + "time_per_iteration": 3.0391266345977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107714, + "balance_loss_mlp": 1.06284642, + "epoch": 0.8320507887649096, + "flos": 549967242240.0, + "grad_norm": 0.0787637779106886, + "language_loss": 0.83117342, + "learning_rate": 7.216516432290843e-05, + "loss": 0.84194481, + "num_input_tokens_seen": 359189552, + "router_z_loss_mlp": 0.14282227, + "step": 4325, + "time_per_iteration": 2.715921640396118 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01077349, + "balance_loss_mlp": 1.06300807, + "epoch": 0.8322431704501732, + "flos": 479398155264.0, + "grad_norm": 0.07342050905632894, + "language_loss": 0.82109582, + "learning_rate": 7.20040172658123e-05, + "loss": 0.8318693, + "num_input_tokens_seen": 359253008, + "router_z_loss_mlp": 0.14331055, + "step": 4326, + "time_per_iteration": 2.5846447944641113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079059, + "balance_loss_mlp": 1.0648613, + "epoch": 0.8324355521354367, + "flos": 572434407936.0, + "grad_norm": 0.08685227783658636, + "language_loss": 0.85463101, + "learning_rate": 7.184303637149308e-05, + "loss": 0.86542159, + "num_input_tokens_seen": 359326368, + "router_z_loss_mlp": 0.14208984, + "step": 4327, + "time_per_iteration": 2.7091739177703857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080456, + "balance_loss_mlp": 1.066401, + "epoch": 0.8326279338207002, + "flos": 503454071808.0, + "grad_norm": 0.06549925067141421, + "language_loss": 0.82185209, + "learning_rate": 7.168222170244888e-05, + "loss": 0.83265662, + "num_input_tokens_seen": 359394192, + "router_z_loss_mlp": 0.14050293, + "step": 4328, + "time_per_iteration": 2.5954463481903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079519, + "balance_loss_mlp": 1.06559563, + "epoch": 0.8328203155059638, + "flos": 605743474176.0, + "grad_norm": 0.06612642563497466, + "language_loss": 0.80887103, + "learning_rate": 7.152157332111364e-05, + "loss": 0.81966615, + "num_input_tokens_seen": 359476016, + "router_z_loss_mlp": 0.13928223, + "step": 4329, + "time_per_iteration": 2.91013240814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078449, + "balance_loss_mlp": 1.06431079, + "epoch": 0.8330126971912274, + "flos": 697798872576.0, + "grad_norm": 0.07964779047842838, + "language_loss": 0.85973161, + "learning_rate": 7.136109128985663e-05, + "loss": 0.87051612, + "num_input_tokens_seen": 
359554048, + "router_z_loss_mlp": 0.14147949, + "step": 4330, + "time_per_iteration": 2.9252800941467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083477, + "balance_loss_mlp": 1.06949401, + "epoch": 0.833205078876491, + "flos": 494042706432.0, + "grad_norm": 0.07334420274354847, + "language_loss": 0.86698532, + "learning_rate": 7.120077567098249e-05, + "loss": 0.87782007, + "num_input_tokens_seen": 359621440, + "router_z_loss_mlp": 0.13977051, + "step": 4331, + "time_per_iteration": 2.65694522857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107354, + "balance_loss_mlp": 1.0595088, + "epoch": 0.8333974605617546, + "flos": 482812793856.0, + "grad_norm": 0.0626600317816662, + "language_loss": 0.82693064, + "learning_rate": 7.104062652673115e-05, + "loss": 0.83766603, + "num_input_tokens_seen": 359690320, + "router_z_loss_mlp": 0.14038086, + "step": 4332, + "time_per_iteration": 2.7553460597991943 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082446, + "balance_loss_mlp": 1.06805778, + "epoch": 0.833589842247018, + "flos": 686821151232.0, + "grad_norm": 0.08703524611699036, + "language_loss": 0.82555664, + "learning_rate": 7.088064391927818e-05, + "loss": 0.83638108, + "num_input_tokens_seen": 359759888, + "router_z_loss_mlp": 0.1439209, + "step": 4333, + "time_per_iteration": 2.828909397125244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107505, + "balance_loss_mlp": 1.06068492, + "epoch": 0.8337822239322816, + "flos": 881739343872.0, + "grad_norm": 0.0819256687419709, + "language_loss": 0.8264882, + "learning_rate": 7.072082791073419e-05, + "loss": 0.83723867, + "num_input_tokens_seen": 359836544, + "router_z_loss_mlp": 0.14367676, + "step": 4334, + "time_per_iteration": 3.081200361251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077915, + "balance_loss_mlp": 1.06354976, + "epoch": 0.8339746056175452, + "flos": 497183132160.0, + "grad_norm": 
0.06916085041313896, + "language_loss": 0.82657623, + "learning_rate": 7.056117856314531e-05, + "loss": 0.83735543, + "num_input_tokens_seen": 359903024, + "router_z_loss_mlp": 0.14355469, + "step": 4335, + "time_per_iteration": 2.6069602966308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079649, + "balance_loss_mlp": 1.06497395, + "epoch": 0.8341669873028088, + "flos": 510495892992.0, + "grad_norm": 0.13056988952609092, + "language_loss": 0.86229324, + "learning_rate": 7.040169593849289e-05, + "loss": 0.87308979, + "num_input_tokens_seen": 359971200, + "router_z_loss_mlp": 0.14660645, + "step": 4336, + "time_per_iteration": 2.591599225997925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074785, + "balance_loss_mlp": 1.06100416, + "epoch": 0.8343593689880723, + "flos": 692321209344.0, + "grad_norm": 0.13647106986897586, + "language_loss": 0.84314466, + "learning_rate": 7.024238009869366e-05, + "loss": 0.85389245, + "num_input_tokens_seen": 360042560, + "router_z_loss_mlp": 0.13818359, + "step": 4337, + "time_per_iteration": 2.8121745586395264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074468, + "balance_loss_mlp": 1.06028175, + "epoch": 0.8345517506733359, + "flos": 552408367104.0, + "grad_norm": 0.07949021649042014, + "language_loss": 0.78494132, + "learning_rate": 7.008323110559956e-05, + "loss": 0.79568601, + "num_input_tokens_seen": 360118048, + "router_z_loss_mlp": 0.14172363, + "step": 4338, + "time_per_iteration": 2.7921485900878906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074509, + "balance_loss_mlp": 1.05995309, + "epoch": 0.8347441323585995, + "flos": 592052613120.0, + "grad_norm": 0.08099810824139689, + "language_loss": 0.76340652, + "learning_rate": 6.992424902099754e-05, + "loss": 0.77415156, + "num_input_tokens_seen": 360192528, + "router_z_loss_mlp": 0.14550781, + "step": 4339, + "time_per_iteration": 2.8709957599639893 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01077425, + "balance_loss_mlp": 1.06347775, + "epoch": 0.834936514043863, + "flos": 614917702656.0, + "grad_norm": 0.06378238002097801, + "language_loss": 0.84410638, + "learning_rate": 6.976543390660983e-05, + "loss": 0.85488063, + "num_input_tokens_seen": 360266880, + "router_z_loss_mlp": 0.13964844, + "step": 4340, + "time_per_iteration": 2.763047456741333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074413, + "balance_loss_mlp": 1.06045377, + "epoch": 0.8351288957291266, + "flos": 467844470784.0, + "grad_norm": 0.08868106733466218, + "language_loss": 0.79730743, + "learning_rate": 6.960678582409424e-05, + "loss": 0.80805159, + "num_input_tokens_seen": 360336336, + "router_z_loss_mlp": 0.13964844, + "step": 4341, + "time_per_iteration": 2.5980849266052246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076452, + "balance_loss_mlp": 1.06226587, + "epoch": 0.8353212774143901, + "flos": 509319747072.0, + "grad_norm": 0.0623240268119806, + "language_loss": 0.78920925, + "learning_rate": 6.944830483504328e-05, + "loss": 0.79997373, + "num_input_tokens_seen": 360409776, + "router_z_loss_mlp": 0.14196777, + "step": 4342, + "time_per_iteration": 2.643486261367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070423, + "balance_loss_mlp": 1.05589139, + "epoch": 0.8355136590996537, + "flos": 687784753152.0, + "grad_norm": 0.06845892903357542, + "language_loss": 0.80452394, + "learning_rate": 6.928999100098483e-05, + "loss": 0.81522822, + "num_input_tokens_seen": 360486800, + "router_z_loss_mlp": 0.1451416, + "step": 4343, + "time_per_iteration": 2.865501880645752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073025, + "balance_loss_mlp": 1.0586009, + "epoch": 0.8357060407849173, + "flos": 984409417728.0, + "grad_norm": 0.06662147764559252, + "language_loss": 0.83445907, + "learning_rate": 6.913184438338138e-05, + "loss": 0.84518933, + "num_input_tokens_seen": 
360568624, + "router_z_loss_mlp": 0.14416504, + "step": 4344, + "time_per_iteration": 3.2890896797180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076846, + "balance_loss_mlp": 1.06261289, + "epoch": 0.8358984224701809, + "flos": 843026393088.0, + "grad_norm": 0.0775623311546164, + "language_loss": 0.85284698, + "learning_rate": 6.89738650436313e-05, + "loss": 0.86361539, + "num_input_tokens_seen": 360652384, + "router_z_loss_mlp": 0.14245605, + "step": 4345, + "time_per_iteration": 3.215787410736084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074225, + "balance_loss_mlp": 1.05949068, + "epoch": 0.8360908041554445, + "flos": 626239019520.0, + "grad_norm": 0.07651611032194032, + "language_loss": 0.82082218, + "learning_rate": 6.881605304306748e-05, + "loss": 0.83156443, + "num_input_tokens_seen": 360723200, + "router_z_loss_mlp": 0.14709473, + "step": 4346, + "time_per_iteration": 2.781648635864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070263, + "balance_loss_mlp": 1.05558789, + "epoch": 0.8362831858407079, + "flos": 576068931072.0, + "grad_norm": 0.06989910685686435, + "language_loss": 0.84813631, + "learning_rate": 6.865840844295796e-05, + "loss": 0.85883898, + "num_input_tokens_seen": 360798240, + "router_z_loss_mlp": 0.14660645, + "step": 4347, + "time_per_iteration": 2.784560203552246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078519, + "balance_loss_mlp": 1.06411839, + "epoch": 0.8364755675259715, + "flos": 833783155200.0, + "grad_norm": 0.12496364806049359, + "language_loss": 0.80586934, + "learning_rate": 6.850093130450569e-05, + "loss": 0.8166545, + "num_input_tokens_seen": 360873552, + "router_z_loss_mlp": 0.1439209, + "step": 4348, + "time_per_iteration": 3.0966875553131104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074403, + "balance_loss_mlp": 1.05977595, + "epoch": 0.8366679492112351, + "flos": 582480834048.0, + "grad_norm": 
0.07680322058687222, + "language_loss": 0.86201406, + "learning_rate": 6.834362168884912e-05, + "loss": 0.87275803, + "num_input_tokens_seen": 360940800, + "router_z_loss_mlp": 0.14624023, + "step": 4349, + "time_per_iteration": 2.688755989074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076536, + "balance_loss_mlp": 1.06240952, + "epoch": 0.8368603308964987, + "flos": 611722948608.0, + "grad_norm": 0.08191236295522616, + "language_loss": 0.87558603, + "learning_rate": 6.818647965706076e-05, + "loss": 0.88635135, + "num_input_tokens_seen": 361014368, + "router_z_loss_mlp": 0.14123535, + "step": 4350, + "time_per_iteration": 2.7902283668518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073123, + "balance_loss_mlp": 1.05873418, + "epoch": 0.8370527125817622, + "flos": 507264062976.0, + "grad_norm": 0.06833066188081044, + "language_loss": 0.85545194, + "learning_rate": 6.802950527014884e-05, + "loss": 0.86618322, + "num_input_tokens_seen": 361087184, + "router_z_loss_mlp": 0.14355469, + "step": 4351, + "time_per_iteration": 2.754146099090576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106708, + "balance_loss_mlp": 1.05251265, + "epoch": 0.8372450942670258, + "flos": 770952619008.0, + "grad_norm": 0.07146969997883827, + "language_loss": 0.82481229, + "learning_rate": 6.787269858905603e-05, + "loss": 0.83548313, + "num_input_tokens_seen": 361160720, + "router_z_loss_mlp": 0.14550781, + "step": 4352, + "time_per_iteration": 2.94331693649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073282, + "balance_loss_mlp": 1.05865479, + "epoch": 0.8374374759522893, + "flos": 579276168192.0, + "grad_norm": 0.07308977517607267, + "language_loss": 0.85184574, + "learning_rate": 6.771605967466033e-05, + "loss": 0.86257857, + "num_input_tokens_seen": 361234432, + "router_z_loss_mlp": 0.14611816, + "step": 4353, + "time_per_iteration": 2.693153142929077 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0107036, + "balance_loss_mlp": 1.0557214, + "epoch": 0.8376298576375529, + "flos": 788129699328.0, + "grad_norm": 0.10269547820167589, + "language_loss": 0.82213604, + "learning_rate": 6.755958858777434e-05, + "loss": 0.83283961, + "num_input_tokens_seen": 361309376, + "router_z_loss_mlp": 0.14624023, + "step": 4354, + "time_per_iteration": 3.0104711055755615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071035, + "balance_loss_mlp": 1.05653942, + "epoch": 0.8378222393228165, + "flos": 577613265408.0, + "grad_norm": 0.06911393067496661, + "language_loss": 0.80482757, + "learning_rate": 6.74032853891452e-05, + "loss": 0.81553793, + "num_input_tokens_seen": 361386768, + "router_z_loss_mlp": 0.14477539, + "step": 4355, + "time_per_iteration": 2.755267858505249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069203, + "balance_loss_mlp": 1.05463576, + "epoch": 0.83801462100808, + "flos": 480865766400.0, + "grad_norm": 0.07252144879258707, + "language_loss": 0.8209852, + "learning_rate": 6.724715013945548e-05, + "loss": 0.8316772, + "num_input_tokens_seen": 361456704, + "router_z_loss_mlp": 0.14550781, + "step": 4356, + "time_per_iteration": 2.6092493534088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068662, + "balance_loss_mlp": 1.05458319, + "epoch": 0.8382070026933436, + "flos": 550817044992.0, + "grad_norm": 0.07005511647028967, + "language_loss": 0.89297009, + "learning_rate": 6.709118289932226e-05, + "loss": 0.90365666, + "num_input_tokens_seen": 361533648, + "router_z_loss_mlp": 0.14074707, + "step": 4357, + "time_per_iteration": 2.8237545490264893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069868, + "balance_loss_mlp": 1.0553124, + "epoch": 0.8383993843786072, + "flos": 624968898048.0, + "grad_norm": 0.07260980188745762, + "language_loss": 0.82167578, + "learning_rate": 6.693538372929725e-05, + "loss": 0.83237451, + "num_input_tokens_seen": 
361614256, + "router_z_loss_mlp": 0.14538574, + "step": 4358, + "time_per_iteration": 2.9932587146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070372, + "balance_loss_mlp": 1.0557332, + "epoch": 0.8385917660638708, + "flos": 491169153024.0, + "grad_norm": 0.13657826580523555, + "language_loss": 0.86348242, + "learning_rate": 6.677975268986719e-05, + "loss": 0.8741861, + "num_input_tokens_seen": 361679008, + "router_z_loss_mlp": 0.14611816, + "step": 4359, + "time_per_iteration": 2.6329987049102783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071816, + "balance_loss_mlp": 1.05714154, + "epoch": 0.8387841477491342, + "flos": 466900692480.0, + "grad_norm": 0.07525835690119967, + "language_loss": 0.87460434, + "learning_rate": 6.662428984145336e-05, + "loss": 0.88532257, + "num_input_tokens_seen": 361747600, + "router_z_loss_mlp": 0.14660645, + "step": 4360, + "time_per_iteration": 2.627370834350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01022038, + "balance_loss_mlp": 1.01560092, + "epoch": 0.8389765294343978, + "flos": 1564188475392.0, + "grad_norm": 0.012085873021789567, + "language_loss": 0.71780187, + "learning_rate": 6.646899524441175e-05, + "loss": 0.72802228, + "num_input_tokens_seen": 361983104, + "router_z_loss_mlp": 0.06445312, + "step": 4361, + "time_per_iteration": 5.010459899902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073482, + "balance_loss_mlp": 1.05930793, + "epoch": 0.8391689111196614, + "flos": 602160708096.0, + "grad_norm": 0.0572272886330789, + "language_loss": 0.82823777, + "learning_rate": 6.631386895903308e-05, + "loss": 0.83897257, + "num_input_tokens_seen": 362065824, + "router_z_loss_mlp": 0.14160156, + "step": 4362, + "time_per_iteration": 2.922370195388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073502, + "balance_loss_mlp": 1.05868399, + "epoch": 0.839361292804925, + "flos": 443047408128.0, + "grad_norm": 
0.07860182159068019, + "language_loss": 0.80037236, + "learning_rate": 6.615891104554261e-05, + "loss": 0.8111074, + "num_input_tokens_seen": 362128240, + "router_z_loss_mlp": 0.14807129, + "step": 4363, + "time_per_iteration": 2.502601146697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072227, + "balance_loss_mlp": 1.05711174, + "epoch": 0.8395536744901886, + "flos": 594167768064.0, + "grad_norm": 0.07291966269797463, + "language_loss": 0.82605469, + "learning_rate": 6.600412156410057e-05, + "loss": 0.83677697, + "num_input_tokens_seen": 362198256, + "router_z_loss_mlp": 0.15100098, + "step": 4364, + "time_per_iteration": 2.713050365447998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076545, + "balance_loss_mlp": 1.06210876, + "epoch": 0.8397460561754521, + "flos": 889836171264.0, + "grad_norm": 0.07837593762341759, + "language_loss": 0.84887516, + "learning_rate": 6.58495005748016e-05, + "loss": 0.8596406, + "num_input_tokens_seen": 362279792, + "router_z_loss_mlp": 0.14416504, + "step": 4365, + "time_per_iteration": 3.1587257385253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075757, + "balance_loss_mlp": 1.06149936, + "epoch": 0.8399384378607156, + "flos": 553503020544.0, + "grad_norm": 0.06763724554244926, + "language_loss": 0.89107072, + "learning_rate": 6.569504813767463e-05, + "loss": 0.90182829, + "num_input_tokens_seen": 362351712, + "router_z_loss_mlp": 0.14257812, + "step": 4366, + "time_per_iteration": 2.629777193069458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107536, + "balance_loss_mlp": 1.06095958, + "epoch": 0.8401308195459792, + "flos": 518923832832.0, + "grad_norm": 0.061847950182012404, + "language_loss": 0.83264184, + "learning_rate": 6.554076431268341e-05, + "loss": 0.84339547, + "num_input_tokens_seen": 362423424, + "router_z_loss_mlp": 0.14404297, + "step": 4367, + "time_per_iteration": 2.659771680831909 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.0107662, + "balance_loss_mlp": 1.06249356, + "epoch": 0.8403232012312428, + "flos": 684933221376.0, + "grad_norm": 0.07038928746315512, + "language_loss": 0.80698526, + "learning_rate": 6.538664915972648e-05, + "loss": 0.81775153, + "num_input_tokens_seen": 362514704, + "router_z_loss_mlp": 0.14123535, + "step": 4368, + "time_per_iteration": 3.017886161804199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072638, + "balance_loss_mlp": 1.0580225, + "epoch": 0.8405155829165063, + "flos": 577672736256.0, + "grad_norm": 0.07391469226483313, + "language_loss": 0.77268881, + "learning_rate": 6.523270273863652e-05, + "loss": 0.7834152, + "num_input_tokens_seen": 362581296, + "router_z_loss_mlp": 0.14587402, + "step": 4369, + "time_per_iteration": 2.6887683868408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073294, + "balance_loss_mlp": 1.05898881, + "epoch": 0.8407079646017699, + "flos": 456627041280.0, + "grad_norm": 0.12071561647223925, + "language_loss": 0.87840384, + "learning_rate": 6.507892510918079e-05, + "loss": 0.88913679, + "num_input_tokens_seen": 362648304, + "router_z_loss_mlp": 0.14294434, + "step": 4370, + "time_per_iteration": 2.521331548690796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073523, + "balance_loss_mlp": 1.05930161, + "epoch": 0.8409003462870335, + "flos": 534917426688.0, + "grad_norm": 0.07405697321132997, + "language_loss": 0.81616879, + "learning_rate": 6.492531633106114e-05, + "loss": 0.82690406, + "num_input_tokens_seen": 362721264, + "router_z_loss_mlp": 0.14221191, + "step": 4371, + "time_per_iteration": 2.8012852668762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076911, + "balance_loss_mlp": 1.06248665, + "epoch": 0.8410927279722971, + "flos": 556759443456.0, + "grad_norm": 0.17788784846398228, + "language_loss": 0.77741635, + "learning_rate": 6.477187646391374e-05, + "loss": 0.78818548, + "num_input_tokens_seen": 
362795312, + "router_z_loss_mlp": 0.14404297, + "step": 4372, + "time_per_iteration": 2.7866506576538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023226, + "balance_loss_mlp": 1.01678848, + "epoch": 0.8412851096575606, + "flos": 1549754270208.0, + "grad_norm": 0.01277325762112691, + "language_loss": 0.77679121, + "learning_rate": 6.461860556730925e-05, + "loss": 0.78702348, + "num_input_tokens_seen": 363026272, + "router_z_loss_mlp": 0.06445312, + "step": 4373, + "time_per_iteration": 4.969724655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079668, + "balance_loss_mlp": 1.06552935, + "epoch": 0.8414774913428241, + "flos": 552042749952.0, + "grad_norm": 0.10963981895984921, + "language_loss": 0.79011232, + "learning_rate": 6.446550370075271e-05, + "loss": 0.80090904, + "num_input_tokens_seen": 363098384, + "router_z_loss_mlp": 0.14147949, + "step": 4374, + "time_per_iteration": 2.7151315212249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079428, + "balance_loss_mlp": 1.06480074, + "epoch": 0.8416698730280877, + "flos": 573015140352.0, + "grad_norm": 0.06677084771491004, + "language_loss": 0.77023661, + "learning_rate": 6.431257092368336e-05, + "loss": 0.78103089, + "num_input_tokens_seen": 363170960, + "router_z_loss_mlp": 0.14611816, + "step": 4375, + "time_per_iteration": 2.6808011531829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107382, + "balance_loss_mlp": 1.05907393, + "epoch": 0.8418622547133513, + "flos": 758731940352.0, + "grad_norm": 0.09075700701482696, + "language_loss": 0.80288577, + "learning_rate": 6.415980729547543e-05, + "loss": 0.81362402, + "num_input_tokens_seen": 363242000, + "router_z_loss_mlp": 0.1472168, + "step": 4376, + "time_per_iteration": 2.951115608215332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075249, + "balance_loss_mlp": 1.0609082, + "epoch": 0.8420546363986149, + "flos": 1074156940800.0, + "grad_norm": 
0.09043332509327401, + "language_loss": 0.72320813, + "learning_rate": 6.40072128754366e-05, + "loss": 0.73396063, + "num_input_tokens_seen": 363340288, + "router_z_loss_mlp": 0.14343262, + "step": 4377, + "time_per_iteration": 3.411957025527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075222, + "balance_loss_mlp": 1.06063056, + "epoch": 0.8422470180838784, + "flos": 525908754432.0, + "grad_norm": 0.09960608064306599, + "language_loss": 0.82466877, + "learning_rate": 6.385478772280933e-05, + "loss": 0.83542103, + "num_input_tokens_seen": 363416208, + "router_z_loss_mlp": 0.14575195, + "step": 4378, + "time_per_iteration": 2.7343966960906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074621, + "balance_loss_mlp": 1.06004119, + "epoch": 0.842439399769142, + "flos": 600834060288.0, + "grad_norm": 0.0684628860225588, + "language_loss": 0.82174343, + "learning_rate": 6.370253189677038e-05, + "loss": 0.83248967, + "num_input_tokens_seen": 363492864, + "router_z_loss_mlp": 0.14562988, + "step": 4379, + "time_per_iteration": 2.743713140487671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078112, + "balance_loss_mlp": 1.06365216, + "epoch": 0.8426317814544055, + "flos": 552222987264.0, + "grad_norm": 0.07159027255471458, + "language_loss": 0.869488, + "learning_rate": 6.355044545643073e-05, + "loss": 0.88026911, + "num_input_tokens_seen": 363572000, + "router_z_loss_mlp": 0.14440918, + "step": 4380, + "time_per_iteration": 2.8095319271087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076585, + "balance_loss_mlp": 1.06231618, + "epoch": 0.8428241631396691, + "flos": 678832980480.0, + "grad_norm": 0.07156323252818027, + "language_loss": 0.77553236, + "learning_rate": 6.33985284608356e-05, + "loss": 0.78629822, + "num_input_tokens_seen": 363646480, + "router_z_loss_mlp": 0.1427002, + "step": 4381, + "time_per_iteration": 2.8225574493408203 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01079603, + "balance_loss_mlp": 1.06550026, + "epoch": 0.8430165448249327, + "flos": 753730748928.0, + "grad_norm": 0.060495968283249074, + "language_loss": 0.79683161, + "learning_rate": 6.324678096896435e-05, + "loss": 0.80762756, + "num_input_tokens_seen": 363737552, + "router_z_loss_mlp": 0.14099121, + "step": 4382, + "time_per_iteration": 3.090226650238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079802, + "balance_loss_mlp": 1.06586623, + "epoch": 0.8432089265101962, + "flos": 699140574720.0, + "grad_norm": 0.06822593281534445, + "language_loss": 0.80561733, + "learning_rate": 6.30952030397306e-05, + "loss": 0.81641531, + "num_input_tokens_seen": 363816016, + "router_z_loss_mlp": 0.13952637, + "step": 4383, + "time_per_iteration": 2.902010917663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077419, + "balance_loss_mlp": 1.06318569, + "epoch": 0.8434013081954598, + "flos": 485767839744.0, + "grad_norm": 0.0829760023739812, + "language_loss": 0.84329182, + "learning_rate": 6.294379473198208e-05, + "loss": 0.85406601, + "num_input_tokens_seen": 363888192, + "router_z_loss_mlp": 0.14233398, + "step": 4384, + "time_per_iteration": 2.672295570373535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077907, + "balance_loss_mlp": 1.06353092, + "epoch": 0.8435936898807234, + "flos": 520623811584.0, + "grad_norm": 0.09380658686475808, + "language_loss": 0.85271668, + "learning_rate": 6.279255610450068e-05, + "loss": 0.86349577, + "num_input_tokens_seen": 363953904, + "router_z_loss_mlp": 0.14355469, + "step": 4385, + "time_per_iteration": 2.6639716625213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079599, + "balance_loss_mlp": 1.06513858, + "epoch": 0.843786071565987, + "flos": 785945534976.0, + "grad_norm": 0.07988119482228719, + "language_loss": 0.80665654, + "learning_rate": 6.264148721600254e-05, + "loss": 0.81745255, + "num_input_tokens_seen": 
364031552, + "router_z_loss_mlp": 0.14453125, + "step": 4386, + "time_per_iteration": 3.0548393726348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01020283, + "balance_loss_mlp": 1.0138458, + "epoch": 0.8439784532512504, + "flos": 1446278436864.0, + "grad_norm": 0.009203156956610654, + "language_loss": 0.75836509, + "learning_rate": 6.24905881251378e-05, + "loss": 0.76856798, + "num_input_tokens_seen": 364256480, + "router_z_loss_mlp": 0.06445312, + "step": 4387, + "time_per_iteration": 4.945947170257568 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107942, + "balance_loss_mlp": 1.06507921, + "epoch": 0.844170834936514, + "flos": 708700243968.0, + "grad_norm": 0.08582903171575712, + "language_loss": 0.82610214, + "learning_rate": 6.23398588904906e-05, + "loss": 0.8368963, + "num_input_tokens_seen": 364329696, + "router_z_loss_mlp": 0.14343262, + "step": 4388, + "time_per_iteration": 2.879181385040283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079902, + "balance_loss_mlp": 1.06535816, + "epoch": 0.8443632166217776, + "flos": 483428030976.0, + "grad_norm": 0.07348538767622947, + "language_loss": 0.79642034, + "learning_rate": 6.218929957057922e-05, + "loss": 0.80721939, + "num_input_tokens_seen": 364400944, + "router_z_loss_mlp": 0.14526367, + "step": 4389, + "time_per_iteration": 2.6795496940612793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080974, + "balance_loss_mlp": 1.06649029, + "epoch": 0.8445555983070412, + "flos": 678694588416.0, + "grad_norm": 0.07673938165161245, + "language_loss": 0.80120802, + "learning_rate": 6.2038910223856e-05, + "loss": 0.81201774, + "num_input_tokens_seen": 364475744, + "router_z_loss_mlp": 0.14453125, + "step": 4390, + "time_per_iteration": 2.8635592460632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080009, + "balance_loss_mlp": 1.0658704, + "epoch": 0.8447479799923048, + "flos": 741485477376.0, + "grad_norm": 
0.07689839370014714, + "language_loss": 0.7424233, + "learning_rate": 6.18886909087073e-05, + "loss": 0.75322342, + "num_input_tokens_seen": 364557248, + "router_z_loss_mlp": 0.14135742, + "step": 4391, + "time_per_iteration": 2.977808952331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080831, + "balance_loss_mlp": 1.06648993, + "epoch": 0.8449403616775683, + "flos": 953306537472.0, + "grad_norm": 0.06854882269895202, + "language_loss": 0.80483949, + "learning_rate": 6.173864168345344e-05, + "loss": 0.81564778, + "num_input_tokens_seen": 364647856, + "router_z_loss_mlp": 0.14343262, + "step": 4392, + "time_per_iteration": 3.2617368698120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083639, + "balance_loss_mlp": 1.06892824, + "epoch": 0.8451327433628318, + "flos": 657363921408.0, + "grad_norm": 0.08738597947785028, + "language_loss": 0.72036451, + "learning_rate": 6.158876260634871e-05, + "loss": 0.73120093, + "num_input_tokens_seen": 364728848, + "router_z_loss_mlp": 0.14685059, + "step": 4393, + "time_per_iteration": 2.9041545391082764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107506, + "balance_loss_mlp": 1.06110108, + "epoch": 0.8453251250480954, + "flos": 446113681920.0, + "grad_norm": 0.08852500649821744, + "language_loss": 0.83482921, + "learning_rate": 6.143905373558112e-05, + "loss": 0.84557986, + "num_input_tokens_seen": 364794032, + "router_z_loss_mlp": 0.13977051, + "step": 4394, + "time_per_iteration": 2.5223376750946045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079032, + "balance_loss_mlp": 1.06472635, + "epoch": 0.845517506733359, + "flos": 542767205376.0, + "grad_norm": 0.08312411172641776, + "language_loss": 0.71075082, + "learning_rate": 6.128951512927305e-05, + "loss": 0.72154111, + "num_input_tokens_seen": 364868624, + "router_z_loss_mlp": 0.14294434, + "step": 4395, + "time_per_iteration": 2.676872968673706 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01076651, + "balance_loss_mlp": 1.06210768, + "epoch": 0.8457098884186226, + "flos": 502440910848.0, + "grad_norm": 0.09142879827690771, + "language_loss": 0.84363878, + "learning_rate": 6.114014684548046e-05, + "loss": 0.85440528, + "num_input_tokens_seen": 364938208, + "router_z_loss_mlp": 0.14526367, + "step": 4396, + "time_per_iteration": 2.6433725357055664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078224, + "balance_loss_mlp": 1.06416929, + "epoch": 0.8459022701038861, + "flos": 448893259776.0, + "grad_norm": 0.0794015178696456, + "language_loss": 0.79516685, + "learning_rate": 6.099094894219326e-05, + "loss": 0.80594903, + "num_input_tokens_seen": 365009440, + "router_z_loss_mlp": 0.14050293, + "step": 4397, + "time_per_iteration": 2.7340524196624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072834, + "balance_loss_mlp": 1.05860019, + "epoch": 0.8460946517891497, + "flos": 743178115584.0, + "grad_norm": 0.0800433568929215, + "language_loss": 0.75171196, + "learning_rate": 6.0841921477335194e-05, + "loss": 0.76244032, + "num_input_tokens_seen": 365085904, + "router_z_loss_mlp": 0.14233398, + "step": 4398, + "time_per_iteration": 2.9435505867004395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076139, + "balance_loss_mlp": 1.06173849, + "epoch": 0.8462870334744133, + "flos": 553216324608.0, + "grad_norm": 0.10324502308758304, + "language_loss": 0.79907882, + "learning_rate": 6.069306450876389e-05, + "loss": 0.8098402, + "num_input_tokens_seen": 365163600, + "router_z_loss_mlp": 0.14379883, + "step": 4399, + "time_per_iteration": 2.844953775405884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01019199, + "balance_loss_mlp": 1.01285696, + "epoch": 0.8464794151596768, + "flos": 1564877864448.0, + "grad_norm": 0.008987182003831137, + "language_loss": 0.81708568, + "learning_rate": 6.054437809427071e-05, + "loss": 0.82727766, + "num_input_tokens_seen": 
365384528, + "router_z_loss_mlp": 0.06347656, + "step": 4400, + "time_per_iteration": 4.9445812702178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072123, + "balance_loss_mlp": 1.05743694, + "epoch": 0.8466717968449403, + "flos": 550197038592.0, + "grad_norm": 0.06222883807624679, + "language_loss": 0.79746759, + "learning_rate": 6.039586229158084e-05, + "loss": 0.80818892, + "num_input_tokens_seen": 365453760, + "router_z_loss_mlp": 0.14685059, + "step": 4401, + "time_per_iteration": 2.7119193077087402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074863, + "balance_loss_mlp": 1.06054568, + "epoch": 0.8468641785302039, + "flos": 551919038976.0, + "grad_norm": 0.06716515000041562, + "language_loss": 0.84632695, + "learning_rate": 6.024751715835314e-05, + "loss": 0.85707557, + "num_input_tokens_seen": 365532416, + "router_z_loss_mlp": 0.14294434, + "step": 4402, + "time_per_iteration": 2.781859874725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072431, + "balance_loss_mlp": 1.05786383, + "epoch": 0.8470565602154675, + "flos": 572671544832.0, + "grad_norm": 0.14264875428102675, + "language_loss": 0.87237591, + "learning_rate": 6.009934275218049e-05, + "loss": 0.88310021, + "num_input_tokens_seen": 365603776, + "router_z_loss_mlp": 0.14550781, + "step": 4403, + "time_per_iteration": 2.743601083755493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_mlp": 1.06340051, + "epoch": 0.8472489419007311, + "flos": 472833179136.0, + "grad_norm": 0.08442038658204883, + "language_loss": 0.83985877, + "learning_rate": 5.995133913058936e-05, + "loss": 0.85063827, + "num_input_tokens_seen": 365670432, + "router_z_loss_mlp": 0.14526367, + "step": 4404, + "time_per_iteration": 2.5450549125671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076393, + "balance_loss_mlp": 1.062374, + "epoch": 0.8474413235859947, + "flos": 798020481024.0, + "grad_norm": 
0.06525598826964277, + "language_loss": 0.795784, + "learning_rate": 5.980350635103954e-05, + "loss": 0.80654788, + "num_input_tokens_seen": 365741584, + "router_z_loss_mlp": 0.14025879, + "step": 4405, + "time_per_iteration": 2.9938158988952637 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077872, + "balance_loss_mlp": 1.06393683, + "epoch": 0.8476337052712581, + "flos": 502379241984.0, + "grad_norm": 0.07458633653372311, + "language_loss": 0.80359912, + "learning_rate": 5.9655844470924866e-05, + "loss": 0.8143779, + "num_input_tokens_seen": 365805344, + "router_z_loss_mlp": 0.13934326, + "step": 4406, + "time_per_iteration": 2.5586020946502686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074026, + "balance_loss_mlp": 1.05969727, + "epoch": 0.8478260869565217, + "flos": 931971101184.0, + "grad_norm": 0.0649551452480515, + "language_loss": 0.83187521, + "learning_rate": 5.9508353547573e-05, + "loss": 0.84261543, + "num_input_tokens_seen": 365890976, + "router_z_loss_mlp": 0.14343262, + "step": 4407, + "time_per_iteration": 3.2481842041015625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077659, + "balance_loss_mlp": 1.0633297, + "epoch": 0.8480184686417853, + "flos": 708811471872.0, + "grad_norm": 0.0832752237181532, + "language_loss": 0.80765074, + "learning_rate": 5.9361033638244855e-05, + "loss": 0.81842732, + "num_input_tokens_seen": 365968912, + "router_z_loss_mlp": 0.14306641, + "step": 4408, + "time_per_iteration": 2.901926279067993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075621, + "balance_loss_mlp": 1.06122029, + "epoch": 0.8482108503270489, + "flos": 614440857600.0, + "grad_norm": 0.06678078731558451, + "language_loss": 0.8214063, + "learning_rate": 5.9213884800135066e-05, + "loss": 0.8321625, + "num_input_tokens_seen": 366047680, + "router_z_loss_mlp": 0.14379883, + "step": 4409, + "time_per_iteration": 2.829897403717041 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01075038, + "balance_loss_mlp": 1.06055427, + "epoch": 0.8484032320123124, + "flos": 531016031232.0, + "grad_norm": 0.08086645135201266, + "language_loss": 0.82160944, + "learning_rate": 5.906690709037194e-05, + "loss": 0.83235979, + "num_input_tokens_seen": 366118720, + "router_z_loss_mlp": 0.14477539, + "step": 4410, + "time_per_iteration": 2.660163164138794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01024272, + "balance_loss_mlp": 1.01792979, + "epoch": 0.848595613697576, + "flos": 1542776315904.0, + "grad_norm": 0.013103843821954883, + "language_loss": 0.76296914, + "learning_rate": 5.892010056601726e-05, + "loss": 0.77321184, + "num_input_tokens_seen": 366346928, + "router_z_loss_mlp": 0.06347656, + "step": 4411, + "time_per_iteration": 4.932765007019043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074439, + "balance_loss_mlp": 1.06016994, + "epoch": 0.8487879953828396, + "flos": 677342974464.0, + "grad_norm": 0.07691974451074937, + "language_loss": 0.737957, + "learning_rate": 5.877346528406635e-05, + "loss": 0.74870145, + "num_input_tokens_seen": 366422848, + "router_z_loss_mlp": 0.1427002, + "step": 4412, + "time_per_iteration": 2.9196579456329346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070882, + "balance_loss_mlp": 1.05686271, + "epoch": 0.8489803770681031, + "flos": 503673956352.0, + "grad_norm": 0.0819904874112488, + "language_loss": 0.79105639, + "learning_rate": 5.8627001301448105e-05, + "loss": 0.8017652, + "num_input_tokens_seen": 366492016, + "router_z_loss_mlp": 0.14025879, + "step": 4413, + "time_per_iteration": 2.661724328994751 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076909, + "balance_loss_mlp": 1.06291389, + "epoch": 0.8491727587533667, + "flos": 563186027520.0, + "grad_norm": 0.06775622187053532, + "language_loss": 0.77081567, + "learning_rate": 5.84807086750247e-05, + "loss": 0.78158486, + "num_input_tokens_seen": 
366566400, + "router_z_loss_mlp": 0.14001465, + "step": 4414, + "time_per_iteration": 2.8016960620880127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071963, + "balance_loss_mlp": 1.0574671, + "epoch": 0.8493651404386302, + "flos": 459784719360.0, + "grad_norm": 0.09984055773639101, + "language_loss": 0.7783742, + "learning_rate": 5.833458746159243e-05, + "loss": 0.78909385, + "num_input_tokens_seen": 366634016, + "router_z_loss_mlp": 0.14489746, + "step": 4415, + "time_per_iteration": 2.5576140880584717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075711, + "balance_loss_mlp": 1.06144118, + "epoch": 0.8495575221238938, + "flos": 461170838016.0, + "grad_norm": 0.09739646427251167, + "language_loss": 0.81540161, + "learning_rate": 5.818863771788013e-05, + "loss": 0.82615876, + "num_input_tokens_seen": 366704384, + "router_z_loss_mlp": 0.14257812, + "step": 4416, + "time_per_iteration": 2.6097960472106934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072366, + "balance_loss_mlp": 1.05833459, + "epoch": 0.8497499038091574, + "flos": 870712063488.0, + "grad_norm": 0.11039248920807271, + "language_loss": 0.81449503, + "learning_rate": 5.8042859500550604e-05, + "loss": 0.82521868, + "num_input_tokens_seen": 366785456, + "router_z_loss_mlp": 0.14038086, + "step": 4417, + "time_per_iteration": 3.0810201168060303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071844, + "balance_loss_mlp": 1.05743134, + "epoch": 0.849942285494421, + "flos": 779600443392.0, + "grad_norm": 0.09244345650082934, + "language_loss": 0.78268075, + "learning_rate": 5.789725286620018e-05, + "loss": 0.79339921, + "num_input_tokens_seen": 366862848, + "router_z_loss_mlp": 0.14404297, + "step": 4418, + "time_per_iteration": 3.004802703857422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076267, + "balance_loss_mlp": 1.06206918, + "epoch": 0.8501346671796844, + "flos": 513816556032.0, + "grad_norm": 
0.06901259436124493, + "language_loss": 0.85190952, + "learning_rate": 5.775181787135819e-05, + "loss": 0.86267221, + "num_input_tokens_seen": 366934800, + "router_z_loss_mlp": 0.1418457, + "step": 4419, + "time_per_iteration": 2.701456308364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075748, + "balance_loss_mlp": 1.06159818, + "epoch": 0.850327048864948, + "flos": 621445602816.0, + "grad_norm": 0.06970940414254242, + "language_loss": 0.83750409, + "learning_rate": 5.76065545724877e-05, + "loss": 0.84826154, + "num_input_tokens_seen": 367015152, + "router_z_loss_mlp": 0.14147949, + "step": 4420, + "time_per_iteration": 2.8450427055358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073069, + "balance_loss_mlp": 1.05829954, + "epoch": 0.8505194305502116, + "flos": 774221524992.0, + "grad_norm": 0.06343395396056568, + "language_loss": 0.79527402, + "learning_rate": 5.746146302598454e-05, + "loss": 0.8060047, + "num_input_tokens_seen": 367092192, + "router_z_loss_mlp": 0.14758301, + "step": 4421, + "time_per_iteration": 3.0368168354034424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010731, + "balance_loss_mlp": 1.05916452, + "epoch": 0.8507118122354752, + "flos": 465257613312.0, + "grad_norm": 0.06692154543848765, + "language_loss": 0.86414826, + "learning_rate": 5.731654328817859e-05, + "loss": 0.8748793, + "num_input_tokens_seen": 367159744, + "router_z_loss_mlp": 0.13964844, + "step": 4422, + "time_per_iteration": 2.5675909519195557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080967, + "balance_loss_mlp": 1.06668544, + "epoch": 0.8509041939207388, + "flos": 534413417472.0, + "grad_norm": 0.06814499560191878, + "language_loss": 0.84655517, + "learning_rate": 5.717179541533257e-05, + "loss": 0.85736477, + "num_input_tokens_seen": 367226384, + "router_z_loss_mlp": 0.14282227, + "step": 4423, + "time_per_iteration": 2.6630845069885254 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01074411, + "balance_loss_mlp": 1.06011748, + "epoch": 0.8510965756060023, + "flos": 583738472448.0, + "grad_norm": 0.07713370691386924, + "language_loss": 0.83968955, + "learning_rate": 5.702721946364264e-05, + "loss": 0.85043365, + "num_input_tokens_seen": 367294768, + "router_z_loss_mlp": 0.14306641, + "step": 4424, + "time_per_iteration": 2.678980827331543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071118, + "balance_loss_mlp": 1.05659819, + "epoch": 0.8512889572912659, + "flos": 600841400832.0, + "grad_norm": 0.06685200855630355, + "language_loss": 0.77975464, + "learning_rate": 5.688281548923796e-05, + "loss": 0.79046577, + "num_input_tokens_seen": 367372368, + "router_z_loss_mlp": 0.1451416, + "step": 4425, + "time_per_iteration": 2.7655956745147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070813, + "balance_loss_mlp": 1.05638838, + "epoch": 0.8514813389765294, + "flos": 654791745024.0, + "grad_norm": 0.07982187700581499, + "language_loss": 0.78191173, + "learning_rate": 5.673858354818151e-05, + "loss": 0.79261982, + "num_input_tokens_seen": 367452656, + "router_z_loss_mlp": 0.14416504, + "step": 4426, + "time_per_iteration": 2.9217934608459473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076173, + "balance_loss_mlp": 1.0619514, + "epoch": 0.851673720661793, + "flos": 429761811456.0, + "grad_norm": 0.1625431829590372, + "language_loss": 0.78373289, + "learning_rate": 5.6594523696468726e-05, + "loss": 0.79449469, + "num_input_tokens_seen": 367517808, + "router_z_loss_mlp": 0.14221191, + "step": 4427, + "time_per_iteration": 2.58944034576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073431, + "balance_loss_mlp": 1.05903041, + "epoch": 0.8518661023470565, + "flos": 641572959744.0, + "grad_norm": 0.07087664669883431, + "language_loss": 0.79935998, + "learning_rate": 5.645063599002875e-05, + "loss": 0.8100943, + "num_input_tokens_seen": 
367591728, + "router_z_loss_mlp": 0.1439209, + "step": 4428, + "time_per_iteration": 2.7852087020874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074309, + "balance_loss_mlp": 1.06025457, + "epoch": 0.8520584840323201, + "flos": 562143504384.0, + "grad_norm": 0.06571018676034746, + "language_loss": 0.79440582, + "learning_rate": 5.630692048472363e-05, + "loss": 0.8051489, + "num_input_tokens_seen": 367664496, + "router_z_loss_mlp": 0.140625, + "step": 4429, + "time_per_iteration": 2.6801624298095703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070263, + "balance_loss_mlp": 1.05610132, + "epoch": 0.8522508657175837, + "flos": 527050395648.0, + "grad_norm": 0.07096995462733162, + "language_loss": 0.78549665, + "learning_rate": 5.61633772363489e-05, + "loss": 0.79619926, + "num_input_tokens_seen": 367735584, + "router_z_loss_mlp": 0.14147949, + "step": 4430, + "time_per_iteration": 2.6519312858581543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071162, + "balance_loss_mlp": 1.05683255, + "epoch": 0.8524432474028473, + "flos": 499120247808.0, + "grad_norm": 0.08116181214962478, + "language_loss": 0.80567259, + "learning_rate": 5.602000630063298e-05, + "loss": 0.8163842, + "num_input_tokens_seen": 367801136, + "router_z_loss_mlp": 0.14318848, + "step": 4431, + "time_per_iteration": 2.5764808654785156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069916, + "balance_loss_mlp": 1.05516994, + "epoch": 0.8526356290881109, + "flos": 421314048000.0, + "grad_norm": 0.0903842329917801, + "language_loss": 0.79655671, + "learning_rate": 5.587680773323706e-05, + "loss": 0.80725586, + "num_input_tokens_seen": 367865312, + "router_z_loss_mlp": 0.14709473, + "step": 4432, + "time_per_iteration": 2.488812208175659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107009, + "balance_loss_mlp": 1.0557611, + "epoch": 0.8528280107733743, + "flos": 507328303104.0, + "grad_norm": 
0.0816751718621874, + "language_loss": 0.8067739, + "learning_rate": 5.5733781589756115e-05, + "loss": 0.81747478, + "num_input_tokens_seen": 367931104, + "router_z_loss_mlp": 0.14331055, + "step": 4433, + "time_per_iteration": 2.6227025985717773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073907, + "balance_loss_mlp": 1.06001878, + "epoch": 0.8530203924586379, + "flos": 445893797376.0, + "grad_norm": 0.08095349591121923, + "language_loss": 0.82720852, + "learning_rate": 5.5590927925717684e-05, + "loss": 0.83794761, + "num_input_tokens_seen": 367995520, + "router_z_loss_mlp": 0.13891602, + "step": 4434, + "time_per_iteration": 2.5052199363708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069434, + "balance_loss_mlp": 1.05520046, + "epoch": 0.8532127741439015, + "flos": 657759273984.0, + "grad_norm": 0.07769115756981526, + "language_loss": 0.83331203, + "learning_rate": 5.54482467965825e-05, + "loss": 0.84400636, + "num_input_tokens_seen": 368073664, + "router_z_loss_mlp": 0.14221191, + "step": 4435, + "time_per_iteration": 2.8407375812530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066367, + "balance_loss_mlp": 1.05220532, + "epoch": 0.8534051558291651, + "flos": 536019420672.0, + "grad_norm": 0.06062923290615588, + "language_loss": 0.82938188, + "learning_rate": 5.5305738257744264e-05, + "loss": 0.84004557, + "num_input_tokens_seen": 368147536, + "router_z_loss_mlp": 0.14160156, + "step": 4436, + "time_per_iteration": 2.721763849258423 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076869, + "balance_loss_mlp": 1.0626117, + "epoch": 0.8535975375144286, + "flos": 533000134656.0, + "grad_norm": 0.08849975131180282, + "language_loss": 0.79207104, + "learning_rate": 5.5163402364529655e-05, + "loss": 0.80283976, + "num_input_tokens_seen": 368218672, + "router_z_loss_mlp": 0.14257812, + "step": 4437, + "time_per_iteration": 2.714531898498535 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.0106703, + "balance_loss_mlp": 1.05290413, + "epoch": 0.8537899191996922, + "flos": 574141727232.0, + "grad_norm": 0.08108043439435358, + "language_loss": 0.8220486, + "learning_rate": 5.502123917219848e-05, + "loss": 0.83271891, + "num_input_tokens_seen": 368287056, + "router_z_loss_mlp": 0.14123535, + "step": 4438, + "time_per_iteration": 2.6950736045837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106593, + "balance_loss_mlp": 1.05088568, + "epoch": 0.8539823008849557, + "flos": 465007993344.0, + "grad_norm": 0.06606452080680034, + "language_loss": 0.83545029, + "learning_rate": 5.48792487359433e-05, + "loss": 0.84610963, + "num_input_tokens_seen": 368358400, + "router_z_loss_mlp": 0.15014648, + "step": 4439, + "time_per_iteration": 2.6966865062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069848, + "balance_loss_mlp": 1.0552212, + "epoch": 0.8541746825702193, + "flos": 554713671168.0, + "grad_norm": 0.07917909499890975, + "language_loss": 0.81682485, + "learning_rate": 5.4737431110889745e-05, + "loss": 0.82752335, + "num_input_tokens_seen": 368427168, + "router_z_loss_mlp": 0.14599609, + "step": 4440, + "time_per_iteration": 2.716801404953003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067798, + "balance_loss_mlp": 1.05304027, + "epoch": 0.8543670642554829, + "flos": 546391816704.0, + "grad_norm": 0.07436951957293847, + "language_loss": 0.77523911, + "learning_rate": 5.4595786352096165e-05, + "loss": 0.78591704, + "num_input_tokens_seen": 368503584, + "router_z_loss_mlp": 0.14733887, + "step": 4441, + "time_per_iteration": 2.8399622440338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071802, + "balance_loss_mlp": 1.05721068, + "epoch": 0.8545594459407464, + "flos": 512027744256.0, + "grad_norm": 0.07108815231458238, + "language_loss": 0.82236481, + "learning_rate": 5.4454314514554236e-05, + "loss": 0.83308291, + 
"num_input_tokens_seen": 368576976, + "router_z_loss_mlp": 0.14575195, + "step": 4442, + "time_per_iteration": 2.6311261653900146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069344, + "balance_loss_mlp": 1.05497956, + "epoch": 0.85475182762601, + "flos": 421185567744.0, + "grad_norm": 0.0731157508212472, + "language_loss": 0.81597567, + "learning_rate": 5.431301565318786e-05, + "loss": 0.8266691, + "num_input_tokens_seen": 368641664, + "router_z_loss_mlp": 0.14355469, + "step": 4443, + "time_per_iteration": 2.499732255935669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067376, + "balance_loss_mlp": 1.05295157, + "epoch": 0.8549442093112736, + "flos": 389435516928.0, + "grad_norm": 0.10168520026489293, + "language_loss": 0.77461678, + "learning_rate": 5.41718898228542e-05, + "loss": 0.78529054, + "num_input_tokens_seen": 368705616, + "router_z_loss_mlp": 0.14428711, + "step": 4444, + "time_per_iteration": 2.5191171169281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065539, + "balance_loss_mlp": 1.05132949, + "epoch": 0.8551365909965372, + "flos": 605926282752.0, + "grad_norm": 0.10020390821281198, + "language_loss": 0.79534721, + "learning_rate": 5.403093707834334e-05, + "loss": 0.80600262, + "num_input_tokens_seen": 368779664, + "router_z_loss_mlp": 0.14196777, + "step": 4445, + "time_per_iteration": 2.80684757232666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072477, + "balance_loss_mlp": 1.05816031, + "epoch": 0.8553289726818007, + "flos": 504160713216.0, + "grad_norm": 0.06547914097019276, + "language_loss": 0.78441411, + "learning_rate": 5.3890157474377865e-05, + "loss": 0.7951389, + "num_input_tokens_seen": 368846656, + "router_z_loss_mlp": 0.14294434, + "step": 4446, + "time_per_iteration": 2.5812063217163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070129, + "balance_loss_mlp": 1.05559802, + "epoch": 0.8555213543670642, + "flos": 
557009063424.0, + "grad_norm": 0.0766106578320322, + "language_loss": 0.75942904, + "learning_rate": 5.374955106561324e-05, + "loss": 0.77013028, + "num_input_tokens_seen": 368923712, + "router_z_loss_mlp": 0.14501953, + "step": 4447, + "time_per_iteration": 2.7502357959747314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066331, + "balance_loss_mlp": 1.05213356, + "epoch": 0.8557137360523278, + "flos": 548104278528.0, + "grad_norm": 0.06446025999572932, + "language_loss": 0.74926281, + "learning_rate": 5.360911790663775e-05, + "loss": 0.75992608, + "num_input_tokens_seen": 368994496, + "router_z_loss_mlp": 0.14196777, + "step": 4448, + "time_per_iteration": 2.619159698486328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070306, + "balance_loss_mlp": 1.055691, + "epoch": 0.8559061177375914, + "flos": 728182628352.0, + "grad_norm": 0.06744228342977912, + "language_loss": 0.78711146, + "learning_rate": 5.346885805197238e-05, + "loss": 0.79781449, + "num_input_tokens_seen": 369077088, + "router_z_loss_mlp": 0.14611816, + "step": 4449, + "time_per_iteration": 2.975527286529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073651, + "balance_loss_mlp": 1.05888104, + "epoch": 0.856098499422855, + "flos": 535881028608.0, + "grad_norm": 0.09470809233033459, + "language_loss": 0.83172154, + "learning_rate": 5.332877155607085e-05, + "loss": 0.84245807, + "num_input_tokens_seen": 369147680, + "router_z_loss_mlp": 0.14758301, + "step": 4450, + "time_per_iteration": 2.6913669109344482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072051, + "balance_loss_mlp": 1.05720961, + "epoch": 0.8562908811081185, + "flos": 573664882176.0, + "grad_norm": 0.0720637583069195, + "language_loss": 0.83487344, + "learning_rate": 5.3188858473319504e-05, + "loss": 0.84559393, + "num_input_tokens_seen": 369224320, + "router_z_loss_mlp": 0.14831543, + "step": 4451, + "time_per_iteration": 2.7148618698120117 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068368, + "balance_loss_mlp": 1.05413437, + "epoch": 0.856483262793382, + "flos": 781754872320.0, + "grad_norm": 0.08319979714541847, + "language_loss": 0.80538082, + "learning_rate": 5.3049118858037426e-05, + "loss": 0.81606448, + "num_input_tokens_seen": 369315744, + "router_z_loss_mlp": 0.14233398, + "step": 4452, + "time_per_iteration": 3.1150898933410645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070338, + "balance_loss_mlp": 1.05599678, + "epoch": 0.8566756444786456, + "flos": 455819083776.0, + "grad_norm": 0.06133419120711316, + "language_loss": 0.84648192, + "learning_rate": 5.290955276447651e-05, + "loss": 0.85718524, + "num_input_tokens_seen": 369382800, + "router_z_loss_mlp": 0.14343262, + "step": 4453, + "time_per_iteration": 2.5603737831115723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067449, + "balance_loss_mlp": 1.05266762, + "epoch": 0.8568680261639092, + "flos": 449382587904.0, + "grad_norm": 0.09315038231056039, + "language_loss": 0.84648412, + "learning_rate": 5.277016024682091e-05, + "loss": 0.85715866, + "num_input_tokens_seen": 369447312, + "router_z_loss_mlp": 0.14782715, + "step": 4454, + "time_per_iteration": 2.5867726802825928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074372, + "balance_loss_mlp": 1.05986428, + "epoch": 0.8570604078491728, + "flos": 479976316416.0, + "grad_norm": 0.07700145526385223, + "language_loss": 0.82769418, + "learning_rate": 5.2630941359187665e-05, + "loss": 0.83843791, + "num_input_tokens_seen": 369512800, + "router_z_loss_mlp": 0.14489746, + "step": 4455, + "time_per_iteration": 2.5525221824645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106982, + "balance_loss_mlp": 1.0550859, + "epoch": 0.8572527895344363, + "flos": 505942184448.0, + "grad_norm": 0.06363308666132952, + "language_loss": 0.84937072, + "learning_rate": 5.249189615562627e-05, + "loss": 
0.86006892, + "num_input_tokens_seen": 369580720, + "router_z_loss_mlp": 0.1472168, + "step": 4456, + "time_per_iteration": 2.576906681060791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069538, + "balance_loss_mlp": 1.05516171, + "epoch": 0.8574451712196999, + "flos": 787044957696.0, + "grad_norm": 0.0582073915457821, + "language_loss": 0.82954866, + "learning_rate": 5.235302469011905e-05, + "loss": 0.840244, + "num_input_tokens_seen": 369672544, + "router_z_loss_mlp": 0.14379883, + "step": 4457, + "time_per_iteration": 3.0546817779541016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062408, + "balance_loss_mlp": 1.04806721, + "epoch": 0.8576375529049635, + "flos": 509252935680.0, + "grad_norm": 0.06955438726938921, + "language_loss": 0.75106084, + "learning_rate": 5.2214327016580575e-05, + "loss": 0.76168495, + "num_input_tokens_seen": 369745776, + "router_z_loss_mlp": 0.14318848, + "step": 4458, + "time_per_iteration": 2.6937506198883057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011775, + "balance_loss_mlp": 1.00548077, + "epoch": 0.857829934590227, + "flos": 1460772486144.0, + "grad_norm": 0.008113169316068945, + "language_loss": 0.84767288, + "learning_rate": 5.207580318885802e-05, + "loss": 0.85779065, + "num_input_tokens_seen": 369975200, + "router_z_loss_mlp": 0.06298828, + "step": 4459, + "time_per_iteration": 4.911555528640747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068424, + "balance_loss_mlp": 1.05389237, + "epoch": 0.8580223162754905, + "flos": 479296839168.0, + "grad_norm": 0.145833654040799, + "language_loss": 0.89347082, + "learning_rate": 5.193745326073118e-05, + "loss": 0.90415508, + "num_input_tokens_seen": 370043296, + "router_z_loss_mlp": 0.1451416, + "step": 4460, + "time_per_iteration": 2.645474433898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069776, + "balance_loss_mlp": 1.05543506, + "epoch": 0.8582146979607541, + 
"flos": 706231954944.0, + "grad_norm": 0.08421529829088402, + "language_loss": 0.79048121, + "learning_rate": 5.179927728591227e-05, + "loss": 0.80117893, + "num_input_tokens_seen": 370111152, + "router_z_loss_mlp": 0.14331055, + "step": 4461, + "time_per_iteration": 2.8346517086029053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071468, + "balance_loss_mlp": 1.05721021, + "epoch": 0.8584070796460177, + "flos": 765158524416.0, + "grad_norm": 0.08957550306757553, + "language_loss": 0.82738662, + "learning_rate": 5.1661275318045874e-05, + "loss": 0.83810127, + "num_input_tokens_seen": 370190272, + "router_z_loss_mlp": 0.14245605, + "step": 4462, + "time_per_iteration": 3.047076463699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069247, + "balance_loss_mlp": 1.05483508, + "epoch": 0.8585994613312813, + "flos": 586829339136.0, + "grad_norm": 0.0707996237534643, + "language_loss": 0.85821873, + "learning_rate": 5.152344741070919e-05, + "loss": 0.86891121, + "num_input_tokens_seen": 370267056, + "router_z_loss_mlp": 0.14404297, + "step": 4463, + "time_per_iteration": 2.789858102798462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066518, + "balance_loss_mlp": 1.052356, + "epoch": 0.8587918430165449, + "flos": 608295826944.0, + "grad_norm": 0.08127144245962697, + "language_loss": 0.78870726, + "learning_rate": 5.138579361741169e-05, + "loss": 0.79937249, + "num_input_tokens_seen": 370344176, + "router_z_loss_mlp": 0.14147949, + "step": 4464, + "time_per_iteration": 2.7969038486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067345, + "balance_loss_mlp": 1.05314672, + "epoch": 0.8589842247018084, + "flos": 588981570048.0, + "grad_norm": 0.07472876002121234, + "language_loss": 0.80512178, + "learning_rate": 5.124831399159535e-05, + "loss": 0.81579524, + "num_input_tokens_seen": 370414224, + "router_z_loss_mlp": 0.14196777, + "step": 4465, + "time_per_iteration": 2.736020565032959 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074093, + "balance_loss_mlp": 1.05929875, + "epoch": 0.8591766063870719, + "flos": 543879111168.0, + "grad_norm": 0.11520064684359647, + "language_loss": 0.78347111, + "learning_rate": 5.1111008586634475e-05, + "loss": 0.79421198, + "num_input_tokens_seen": 370484736, + "router_z_loss_mlp": 0.14758301, + "step": 4466, + "time_per_iteration": 2.7088613510131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072551, + "balance_loss_mlp": 1.05799568, + "epoch": 0.8593689880723355, + "flos": 493756010496.0, + "grad_norm": 0.07199823899248142, + "language_loss": 0.80669403, + "learning_rate": 5.0973877455835816e-05, + "loss": 0.81741953, + "num_input_tokens_seen": 370556512, + "router_z_loss_mlp": 0.14526367, + "step": 4467, + "time_per_iteration": 2.751774311065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074751, + "balance_loss_mlp": 1.06001639, + "epoch": 0.8595613697575991, + "flos": 533909408256.0, + "grad_norm": 0.07801691002975698, + "language_loss": 0.83068347, + "learning_rate": 5.083692065243822e-05, + "loss": 0.84143102, + "num_input_tokens_seen": 370622880, + "router_z_loss_mlp": 0.1472168, + "step": 4468, + "time_per_iteration": 2.6254448890686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070926, + "balance_loss_mlp": 1.05635858, + "epoch": 0.8597537514428626, + "flos": 617628271104.0, + "grad_norm": 0.07457537179448775, + "language_loss": 0.76102448, + "learning_rate": 5.070013822961328e-05, + "loss": 0.77173376, + "num_input_tokens_seen": 370691632, + "router_z_loss_mlp": 0.14562988, + "step": 4469, + "time_per_iteration": 2.78564453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106627, + "balance_loss_mlp": 1.05185747, + "epoch": 0.8599461331281262, + "flos": 608730826752.0, + "grad_norm": 0.07387770607990847, + "language_loss": 0.83740634, + "learning_rate": 5.056353024046462e-05, + "loss": 
0.84806907, + "num_input_tokens_seen": 370764848, + "router_z_loss_mlp": 0.14416504, + "step": 4470, + "time_per_iteration": 2.7199819087982178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073459, + "balance_loss_mlp": 1.05884385, + "epoch": 0.8601385148133898, + "flos": 551252044800.0, + "grad_norm": 0.07776930298197288, + "language_loss": 0.83086514, + "learning_rate": 5.042709673802786e-05, + "loss": 0.84159976, + "num_input_tokens_seen": 370832496, + "router_z_loss_mlp": 0.14599609, + "step": 4471, + "time_per_iteration": 2.655369281768799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106711, + "balance_loss_mlp": 1.05244768, + "epoch": 0.8603308964986534, + "flos": 581200800768.0, + "grad_norm": 0.05601587567115835, + "language_loss": 0.80901635, + "learning_rate": 5.0290837775271494e-05, + "loss": 0.81968743, + "num_input_tokens_seen": 370917104, + "router_z_loss_mlp": 0.14648438, + "step": 4472, + "time_per_iteration": 2.8570289611816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073921, + "balance_loss_mlp": 1.05958033, + "epoch": 0.8605232781839169, + "flos": 629013828096.0, + "grad_norm": 0.0851895281729739, + "language_loss": 0.7508207, + "learning_rate": 5.0154753405095846e-05, + "loss": 0.76155984, + "num_input_tokens_seen": 370984512, + "router_z_loss_mlp": 0.14331055, + "step": 4473, + "time_per_iteration": 2.7473347187042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066806, + "balance_loss_mlp": 1.05288196, + "epoch": 0.8607156598691804, + "flos": 468141078528.0, + "grad_norm": 0.0733266349612676, + "language_loss": 0.76999867, + "learning_rate": 5.0018843680333604e-05, + "loss": 0.78066671, + "num_input_tokens_seen": 371049664, + "router_z_loss_mlp": 0.13928223, + "step": 4474, + "time_per_iteration": 2.511343002319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069976, + "balance_loss_mlp": 1.0554204, + "epoch": 0.860908041554444, + 
"flos": 488394344448.0, + "grad_norm": 0.06480096420670076, + "language_loss": 0.82572103, + "learning_rate": 4.988310865374945e-05, + "loss": 0.83642077, + "num_input_tokens_seen": 371120704, + "router_z_loss_mlp": 0.14550781, + "step": 4475, + "time_per_iteration": 2.6399173736572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066208, + "balance_loss_mlp": 1.05160475, + "epoch": 0.8611004232397076, + "flos": 592094831616.0, + "grad_norm": 0.08039350372940637, + "language_loss": 0.80106586, + "learning_rate": 4.974754837804057e-05, + "loss": 0.81172794, + "num_input_tokens_seen": 371189376, + "router_z_loss_mlp": 0.14575195, + "step": 4476, + "time_per_iteration": 2.7327587604522705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067278, + "balance_loss_mlp": 1.05268657, + "epoch": 0.8612928049249712, + "flos": 774209041920.0, + "grad_norm": 0.08404476635777386, + "language_loss": 0.86105013, + "learning_rate": 4.9612162905836036e-05, + "loss": 0.87172294, + "num_input_tokens_seen": 371275184, + "router_z_loss_mlp": 0.14587402, + "step": 4477, + "time_per_iteration": 3.0373780727386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070558, + "balance_loss_mlp": 1.0560379, + "epoch": 0.8614851866102347, + "flos": 537553843200.0, + "grad_norm": 0.07409303863444187, + "language_loss": 0.82399005, + "learning_rate": 4.947695228969718e-05, + "loss": 0.83469558, + "num_input_tokens_seen": 371347920, + "router_z_loss_mlp": 0.14501953, + "step": 4478, + "time_per_iteration": 2.6591262817382812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065378, + "balance_loss_mlp": 1.05058384, + "epoch": 0.8616775682954982, + "flos": 565916419584.0, + "grad_norm": 0.0631568750529317, + "language_loss": 0.78993368, + "learning_rate": 4.934191658211729e-05, + "loss": 0.80058742, + "num_input_tokens_seen": 371419728, + "router_z_loss_mlp": 0.14758301, + "step": 4479, + "time_per_iteration": 
2.658097267150879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064115, + "balance_loss_mlp": 1.04950047, + "epoch": 0.8618699499807618, + "flos": 481592231424.0, + "grad_norm": 0.08056621333119694, + "language_loss": 0.81684464, + "learning_rate": 4.92070558355221e-05, + "loss": 0.8274858, + "num_input_tokens_seen": 371488768, + "router_z_loss_mlp": 0.14599609, + "step": 4480, + "time_per_iteration": 2.740461826324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065788, + "balance_loss_mlp": 1.05064893, + "epoch": 0.8620623316660254, + "flos": 649506802176.0, + "grad_norm": 0.09178637481002815, + "language_loss": 0.7409358, + "learning_rate": 4.9072370102269226e-05, + "loss": 0.75159371, + "num_input_tokens_seen": 371560144, + "router_z_loss_mlp": 0.15124512, + "step": 4481, + "time_per_iteration": 2.8202409744262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070778, + "balance_loss_mlp": 1.05633044, + "epoch": 0.862254713351289, + "flos": 751781523456.0, + "grad_norm": 0.07336978506416574, + "language_loss": 0.85627228, + "learning_rate": 4.893785943464801e-05, + "loss": 0.86698008, + "num_input_tokens_seen": 371635920, + "router_z_loss_mlp": 0.14428711, + "step": 4482, + "time_per_iteration": 2.9723026752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072478, + "balance_loss_mlp": 1.05771959, + "epoch": 0.8624470950365525, + "flos": 841543727616.0, + "grad_norm": 0.06427731204985579, + "language_loss": 0.77644771, + "learning_rate": 4.880352388488024e-05, + "loss": 0.7871725, + "num_input_tokens_seen": 371727664, + "router_z_loss_mlp": 0.14746094, + "step": 4483, + "time_per_iteration": 3.2497451305389404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072184, + "balance_loss_mlp": 1.05741429, + "epoch": 0.8626394767218161, + "flos": 754793468928.0, + "grad_norm": 0.0734090196676215, + "language_loss": 0.83015764, + "learning_rate": 
4.866936350511969e-05, + "loss": 0.84087956, + "num_input_tokens_seen": 371800832, + "router_z_loss_mlp": 0.14746094, + "step": 4484, + "time_per_iteration": 2.8956780433654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075352, + "balance_loss_mlp": 1.06092763, + "epoch": 0.8628318584070797, + "flos": 703585626624.0, + "grad_norm": 0.06806275994397937, + "language_loss": 0.82180882, + "learning_rate": 4.853537834745203e-05, + "loss": 0.83256233, + "num_input_tokens_seen": 371871472, + "router_z_loss_mlp": 0.14404297, + "step": 4485, + "time_per_iteration": 2.9138083457946777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066713, + "balance_loss_mlp": 1.05188346, + "epoch": 0.8630242400923432, + "flos": 471244428288.0, + "grad_norm": 0.06130669140351844, + "language_loss": 0.77192688, + "learning_rate": 4.840156846389487e-05, + "loss": 0.78259403, + "num_input_tokens_seen": 371936512, + "router_z_loss_mlp": 0.14807129, + "step": 4486, + "time_per_iteration": 2.5923945903778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068939, + "balance_loss_mlp": 1.05420458, + "epoch": 0.8632166217776067, + "flos": 964363553280.0, + "grad_norm": 0.09142848805617776, + "language_loss": 0.77645731, + "learning_rate": 4.826793390639783e-05, + "loss": 0.78714675, + "num_input_tokens_seen": 372018032, + "router_z_loss_mlp": 0.14697266, + "step": 4487, + "time_per_iteration": 3.2063825130462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066737, + "balance_loss_mlp": 1.05182362, + "epoch": 0.8634090034628703, + "flos": 767913509376.0, + "grad_norm": 0.07487665113796628, + "language_loss": 0.78699821, + "learning_rate": 4.813447472684246e-05, + "loss": 0.7976656, + "num_input_tokens_seen": 372092176, + "router_z_loss_mlp": 0.14880371, + "step": 4488, + "time_per_iteration": 3.005026340484619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069915, + "balance_loss_mlp": 1.05519223, 
+ "epoch": 0.8636013851481339, + "flos": 520591504896.0, + "grad_norm": 0.07136180617558878, + "language_loss": 0.8320052, + "learning_rate": 4.800119097704214e-05, + "loss": 0.8427043, + "num_input_tokens_seen": 372166880, + "router_z_loss_mlp": 0.14697266, + "step": 4489, + "time_per_iteration": 2.7364392280578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067253, + "balance_loss_mlp": 1.05310261, + "epoch": 0.8637937668333975, + "flos": 632144342016.0, + "grad_norm": 0.08078555791149708, + "language_loss": 0.80594444, + "learning_rate": 4.7868082708742324e-05, + "loss": 0.81661701, + "num_input_tokens_seen": 372234608, + "router_z_loss_mlp": 0.14135742, + "step": 4490, + "time_per_iteration": 2.7436652183532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067307, + "balance_loss_mlp": 1.05278766, + "epoch": 0.8639861485186611, + "flos": 856094676480.0, + "grad_norm": 0.12060339505019638, + "language_loss": 0.76427901, + "learning_rate": 4.773514997362e-05, + "loss": 0.77495205, + "num_input_tokens_seen": 372314704, + "router_z_loss_mlp": 0.1451416, + "step": 4491, + "time_per_iteration": 3.0809972286224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107324, + "balance_loss_mlp": 1.05887485, + "epoch": 0.8641785302039245, + "flos": 481261118976.0, + "grad_norm": 0.07217644501774635, + "language_loss": 0.77776736, + "learning_rate": 4.7602392823284605e-05, + "loss": 0.78849971, + "num_input_tokens_seen": 372374848, + "router_z_loss_mlp": 0.14355469, + "step": 4492, + "time_per_iteration": 2.5654242038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068525, + "balance_loss_mlp": 1.05417252, + "epoch": 0.8643709118891881, + "flos": 504637558272.0, + "grad_norm": 0.06656380617407046, + "language_loss": 0.80193943, + "learning_rate": 4.746981130927675e-05, + "loss": 0.81262463, + "num_input_tokens_seen": 372442432, + "router_z_loss_mlp": 0.14355469, + "step": 4493, + 
"time_per_iteration": 2.670027017593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069512, + "balance_loss_mlp": 1.0553143, + "epoch": 0.8645632935744517, + "flos": 552368719872.0, + "grad_norm": 0.07682965600058904, + "language_loss": 0.82227212, + "learning_rate": 4.733740548306908e-05, + "loss": 0.83296728, + "num_input_tokens_seen": 372520048, + "router_z_loss_mlp": 0.1418457, + "step": 4494, + "time_per_iteration": 2.8134214878082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065325, + "balance_loss_mlp": 1.05059028, + "epoch": 0.8647556752597153, + "flos": 524737751040.0, + "grad_norm": 0.07423694225628534, + "language_loss": 0.83722866, + "learning_rate": 4.7205175396066336e-05, + "loss": 0.84788191, + "num_input_tokens_seen": 372587968, + "router_z_loss_mlp": 0.14709473, + "step": 4495, + "time_per_iteration": 2.6327974796295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073184, + "balance_loss_mlp": 1.058617, + "epoch": 0.8649480569449788, + "flos": 787768851456.0, + "grad_norm": 0.07327759131126368, + "language_loss": 0.82331359, + "learning_rate": 4.707312109960471e-05, + "loss": 0.83404541, + "num_input_tokens_seen": 372672544, + "router_z_loss_mlp": 0.14550781, + "step": 4496, + "time_per_iteration": 3.0912046432495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069634, + "balance_loss_mlp": 1.05510235, + "epoch": 0.8651404386302424, + "flos": 763863810048.0, + "grad_norm": 0.07168527754469435, + "language_loss": 0.76572919, + "learning_rate": 4.694124264495225e-05, + "loss": 0.77642548, + "num_input_tokens_seen": 372751296, + "router_z_loss_mlp": 0.14526367, + "step": 4497, + "time_per_iteration": 3.043983221054077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067126, + "balance_loss_mlp": 1.05242729, + "epoch": 0.865332820315506, + "flos": 539893651968.0, + "grad_norm": 0.06672148584228833, + "language_loss": 0.82233298, + 
"learning_rate": 4.680954008330851e-05, + "loss": 0.83300424, + "num_input_tokens_seen": 372825264, + "router_z_loss_mlp": 0.14685059, + "step": 4498, + "time_per_iteration": 2.719404697418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009012, + "balance_loss_mlp": 1.0026226, + "epoch": 0.8655252020007695, + "flos": 1476632830464.0, + "grad_norm": 0.004886545059894445, + "language_loss": 0.79174447, + "learning_rate": 4.667801346580519e-05, + "loss": 0.80183458, + "num_input_tokens_seen": 373052000, + "router_z_loss_mlp": 0.06396484, + "step": 4499, + "time_per_iteration": 4.798980474472046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062842, + "balance_loss_mlp": 1.0486083, + "epoch": 0.8657175836860331, + "flos": 517369586688.0, + "grad_norm": 0.08270654530250093, + "language_loss": 0.82950461, + "learning_rate": 4.6546662843505396e-05, + "loss": 0.84013307, + "num_input_tokens_seen": 373124128, + "router_z_loss_mlp": 0.14233398, + "step": 4500, + "time_per_iteration": 2.731417179107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106723, + "balance_loss_mlp": 1.05263877, + "epoch": 0.8659099653712966, + "flos": 590523333120.0, + "grad_norm": 0.07191381207287514, + "language_loss": 0.80231231, + "learning_rate": 4.641548826740394e-05, + "loss": 0.81298465, + "num_input_tokens_seen": 373195472, + "router_z_loss_mlp": 0.14575195, + "step": 4501, + "time_per_iteration": 2.697899341583252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062306, + "balance_loss_mlp": 1.04800117, + "epoch": 0.8661023470565602, + "flos": 590449181184.0, + "grad_norm": 0.05594849429502133, + "language_loss": 0.87944901, + "learning_rate": 4.628448978842731e-05, + "loss": 0.89007205, + "num_input_tokens_seen": 373273504, + "router_z_loss_mlp": 0.14282227, + "step": 4502, + "time_per_iteration": 2.8466720581054688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064504, + 
"balance_loss_mlp": 1.04962659, + "epoch": 0.8662947287418238, + "flos": 567670726656.0, + "grad_norm": 0.06639072474575029, + "language_loss": 0.79237312, + "learning_rate": 4.61536674574336e-05, + "loss": 0.80301815, + "num_input_tokens_seen": 373346032, + "router_z_loss_mlp": 0.1484375, + "step": 4503, + "time_per_iteration": 2.7786972522735596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072513, + "balance_loss_mlp": 1.05816054, + "epoch": 0.8664871104270874, + "flos": 515929139712.0, + "grad_norm": 0.05596301898353544, + "language_loss": 0.82147396, + "learning_rate": 4.6023021325212636e-05, + "loss": 0.8321991, + "num_input_tokens_seen": 373419968, + "router_z_loss_mlp": 0.14355469, + "step": 4504, + "time_per_iteration": 2.7921864986419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075701, + "balance_loss_mlp": 1.06135976, + "epoch": 0.866679492112351, + "flos": 557263452672.0, + "grad_norm": 0.07445535583019337, + "language_loss": 0.78300965, + "learning_rate": 4.589255144248561e-05, + "loss": 0.79376662, + "num_input_tokens_seen": 373502448, + "router_z_loss_mlp": 0.14343262, + "step": 4505, + "time_per_iteration": 2.857663154602051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062777, + "balance_loss_mlp": 1.04850721, + "epoch": 0.8668718737976144, + "flos": 722448004608.0, + "grad_norm": 0.09695588327061085, + "language_loss": 0.81800681, + "learning_rate": 4.57622578599054e-05, + "loss": 0.82863462, + "num_input_tokens_seen": 373581184, + "router_z_loss_mlp": 0.14257812, + "step": 4506, + "time_per_iteration": 2.929633855819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065238, + "balance_loss_mlp": 1.050861, + "epoch": 0.867064255482878, + "flos": 600705580032.0, + "grad_norm": 0.07502570041453936, + "language_loss": 0.84632653, + "learning_rate": 4.5632140628056705e-05, + "loss": 0.85697895, + "num_input_tokens_seen": 373652272, + "router_z_loss_mlp": 
0.14367676, + "step": 4507, + "time_per_iteration": 2.7329187393188477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068231, + "balance_loss_mlp": 1.05338943, + "epoch": 0.8672566371681416, + "flos": 803527879680.0, + "grad_norm": 0.07513188405638076, + "language_loss": 0.76312721, + "learning_rate": 4.550219979745529e-05, + "loss": 0.77380955, + "num_input_tokens_seen": 373734896, + "router_z_loss_mlp": 0.14831543, + "step": 4508, + "time_per_iteration": 3.0379912853240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064243, + "balance_loss_mlp": 1.04998589, + "epoch": 0.8674490188534052, + "flos": 627368177664.0, + "grad_norm": 0.061997847025714266, + "language_loss": 0.83527964, + "learning_rate": 4.5372435418548905e-05, + "loss": 0.84592211, + "num_input_tokens_seen": 373806960, + "router_z_loss_mlp": 0.14257812, + "step": 4509, + "time_per_iteration": 2.7606923580169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071433, + "balance_loss_mlp": 1.05680609, + "epoch": 0.8676414005386687, + "flos": 727831692288.0, + "grad_norm": 0.06323363214772874, + "language_loss": 0.86261082, + "learning_rate": 4.524284754171615e-05, + "loss": 0.87332517, + "num_input_tokens_seen": 373888352, + "router_z_loss_mlp": 0.14624023, + "step": 4510, + "time_per_iteration": 2.975365161895752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106234, + "balance_loss_mlp": 1.04809463, + "epoch": 0.8678337822239323, + "flos": 539972573184.0, + "grad_norm": 0.08527155425852233, + "language_loss": 0.80449998, + "learning_rate": 4.5113436217267765e-05, + "loss": 0.81512344, + "num_input_tokens_seen": 373962112, + "router_z_loss_mlp": 0.14257812, + "step": 4511, + "time_per_iteration": 2.7997350692749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067767, + "balance_loss_mlp": 1.05356872, + "epoch": 0.8680261639091958, + "flos": 507521023488.0, + "grad_norm": 0.13843018607601695, + 
"language_loss": 0.79428059, + "learning_rate": 4.4984201495445744e-05, + "loss": 0.80495822, + "num_input_tokens_seen": 374028256, + "router_z_loss_mlp": 0.14196777, + "step": 4512, + "time_per_iteration": 2.6306517124176025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066222, + "balance_loss_mlp": 1.05204797, + "epoch": 0.8682185455944594, + "flos": 487126794240.0, + "grad_norm": 0.0652028208920273, + "language_loss": 0.80685651, + "learning_rate": 4.4855143426423275e-05, + "loss": 0.81751871, + "num_input_tokens_seen": 374100080, + "router_z_loss_mlp": 0.14160156, + "step": 4513, + "time_per_iteration": 2.631469488143921 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063365, + "balance_loss_mlp": 1.04905963, + "epoch": 0.868410927279723, + "flos": 603690361344.0, + "grad_norm": 0.08387419334599636, + "language_loss": 0.80628252, + "learning_rate": 4.472626206030528e-05, + "loss": 0.81691617, + "num_input_tokens_seen": 374174368, + "router_z_loss_mlp": 0.14306641, + "step": 4514, + "time_per_iteration": 2.703423500061035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061634, + "balance_loss_mlp": 1.04734087, + "epoch": 0.8686033089649865, + "flos": 1118985186816.0, + "grad_norm": 0.09897046417963085, + "language_loss": 0.84731203, + "learning_rate": 4.4597557447127846e-05, + "loss": 0.8579284, + "num_input_tokens_seen": 374257328, + "router_z_loss_mlp": 0.14294434, + "step": 4515, + "time_per_iteration": 3.3720173835754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066888, + "balance_loss_mlp": 1.05241561, + "epoch": 0.8687956906502501, + "flos": 568019091456.0, + "grad_norm": 0.0907599826984789, + "language_loss": 0.83635509, + "learning_rate": 4.446902963685862e-05, + "loss": 0.84702396, + "num_input_tokens_seen": 374327936, + "router_z_loss_mlp": 0.14477539, + "step": 4516, + "time_per_iteration": 2.661489248275757 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01065423, + "balance_loss_mlp": 1.05126095, + "epoch": 0.8689880723355137, + "flos": 544338703872.0, + "grad_norm": 0.07393998563485746, + "language_loss": 0.84213966, + "learning_rate": 4.4340678679396454e-05, + "loss": 0.85279387, + "num_input_tokens_seen": 374400496, + "router_z_loss_mlp": 0.14147949, + "step": 4517, + "time_per_iteration": 2.6653032302856445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060388, + "balance_loss_mlp": 1.0461185, + "epoch": 0.8691804540207773, + "flos": 457425086976.0, + "grad_norm": 0.06604754352210267, + "language_loss": 0.86236376, + "learning_rate": 4.4212504624571495e-05, + "loss": 0.8729676, + "num_input_tokens_seen": 374470528, + "router_z_loss_mlp": 0.14257812, + "step": 4518, + "time_per_iteration": 2.6518642902374268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065916, + "balance_loss_mlp": 1.05138481, + "epoch": 0.8693728357060407, + "flos": 591872375808.0, + "grad_norm": 0.07030672265979203, + "language_loss": 0.80032271, + "learning_rate": 4.40845075221456e-05, + "loss": 0.81098187, + "num_input_tokens_seen": 374542656, + "router_z_loss_mlp": 0.1451416, + "step": 4519, + "time_per_iteration": 2.747318983078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061664, + "balance_loss_mlp": 1.04732347, + "epoch": 0.8695652173913043, + "flos": 680263515648.0, + "grad_norm": 0.08647711419457829, + "language_loss": 0.7937988, + "learning_rate": 4.395668742181164e-05, + "loss": 0.80441546, + "num_input_tokens_seen": 374617232, + "router_z_loss_mlp": 0.14318848, + "step": 4520, + "time_per_iteration": 2.8835909366607666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066407, + "balance_loss_mlp": 1.05213761, + "epoch": 0.8697575990765679, + "flos": 492362551296.0, + "grad_norm": 0.0756040162570651, + "language_loss": 0.78086627, + "learning_rate": 4.38290443731934e-05, + "loss": 0.79153037, + "num_input_tokens_seen": 
374681888, + "router_z_loss_mlp": 0.14257812, + "step": 4521, + "time_per_iteration": 2.5724833011627197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066494, + "balance_loss_mlp": 1.05209351, + "epoch": 0.8699499807618315, + "flos": 526949079552.0, + "grad_norm": 0.062480964319909835, + "language_loss": 0.81658232, + "learning_rate": 4.370157842584671e-05, + "loss": 0.82724726, + "num_input_tokens_seen": 374750464, + "router_z_loss_mlp": 0.14404297, + "step": 4522, + "time_per_iteration": 2.6957974433898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065943, + "balance_loss_mlp": 1.05160189, + "epoch": 0.8701423624470951, + "flos": 814342616064.0, + "grad_norm": 0.06768287451120002, + "language_loss": 0.80298173, + "learning_rate": 4.357428962925808e-05, + "loss": 0.81364119, + "num_input_tokens_seen": 374836064, + "router_z_loss_mlp": 0.14331055, + "step": 4523, + "time_per_iteration": 3.1663365364074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064286, + "balance_loss_mlp": 1.04987335, + "epoch": 0.8703347441323586, + "flos": 556789178880.0, + "grad_norm": 0.06671589316380268, + "language_loss": 0.88140607, + "learning_rate": 4.344717803284542e-05, + "loss": 0.89204895, + "num_input_tokens_seen": 374903392, + "router_z_loss_mlp": 0.14416504, + "step": 4524, + "time_per_iteration": 2.6627633571624756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064956, + "balance_loss_mlp": 1.04983997, + "epoch": 0.8705271258176221, + "flos": 585443220480.0, + "grad_norm": 0.06181301750116614, + "language_loss": 0.84106493, + "learning_rate": 4.3320243685957825e-05, + "loss": 0.85171449, + "num_input_tokens_seen": 374985904, + "router_z_loss_mlp": 0.15100098, + "step": 4525, + "time_per_iteration": 2.813011646270752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106471, + "balance_loss_mlp": 1.04989266, + "epoch": 0.8707195075028857, + "flos": 669216411648.0, + 
"grad_norm": 0.06605227762602037, + "language_loss": 0.8533206, + "learning_rate": 4.3193486637875536e-05, + "loss": 0.86396778, + "num_input_tokens_seen": 375062992, + "router_z_loss_mlp": 0.14794922, + "step": 4526, + "time_per_iteration": 2.8933169841766357 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065001, + "balance_loss_mlp": 1.05061281, + "epoch": 0.8709118891881493, + "flos": 520391443968.0, + "grad_norm": 0.060370068078767804, + "language_loss": 0.83663857, + "learning_rate": 4.306690693781007e-05, + "loss": 0.84728855, + "num_input_tokens_seen": 375139296, + "router_z_loss_mlp": 0.1439209, + "step": 4527, + "time_per_iteration": 2.761434555053711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064419, + "balance_loss_mlp": 1.04992294, + "epoch": 0.8711042708734128, + "flos": 553208984064.0, + "grad_norm": 0.08414030206759188, + "language_loss": 0.81535316, + "learning_rate": 4.294050463490401e-05, + "loss": 0.82599723, + "num_input_tokens_seen": 375206576, + "router_z_loss_mlp": 0.14489746, + "step": 4528, + "time_per_iteration": 2.650632619857788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062212, + "balance_loss_mlp": 1.04762089, + "epoch": 0.8712966525586764, + "flos": 502193862144.0, + "grad_norm": 0.09478165614322998, + "language_loss": 0.81905985, + "learning_rate": 4.281427977823094e-05, + "loss": 0.82968199, + "num_input_tokens_seen": 375279008, + "router_z_loss_mlp": 0.14587402, + "step": 4529, + "time_per_iteration": 2.7222495079040527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106655, + "balance_loss_mlp": 1.05204225, + "epoch": 0.87148903424394, + "flos": 804096129024.0, + "grad_norm": 0.09748177574761158, + "language_loss": 0.73896039, + "learning_rate": 4.268823241679593e-05, + "loss": 0.74962586, + "num_input_tokens_seen": 375368512, + "router_z_loss_mlp": 0.14489746, + "step": 4530, + "time_per_iteration": 3.050337791442871 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065647, + "balance_loss_mlp": 1.05079401, + "epoch": 0.8716814159292036, + "flos": 773438160384.0, + "grad_norm": 0.0689062748020189, + "language_loss": 0.86388242, + "learning_rate": 4.256236259953489e-05, + "loss": 0.8745389, + "num_input_tokens_seen": 375450528, + "router_z_loss_mlp": 0.14831543, + "step": 4531, + "time_per_iteration": 3.0478785037994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065486, + "balance_loss_mlp": 1.05087137, + "epoch": 0.8718737976144671, + "flos": 486835329024.0, + "grad_norm": 0.08577279593283388, + "language_loss": 0.85180438, + "learning_rate": 4.243667037531468e-05, + "loss": 0.86245918, + "num_input_tokens_seen": 375518256, + "router_z_loss_mlp": 0.14599609, + "step": 4532, + "time_per_iteration": 2.6602768898010254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059568, + "balance_loss_mlp": 1.04522741, + "epoch": 0.8720661792997306, + "flos": 584123913216.0, + "grad_norm": 0.0657296857240319, + "language_loss": 0.78559881, + "learning_rate": 4.2311155792933264e-05, + "loss": 0.79619455, + "num_input_tokens_seen": 375588112, + "router_z_loss_mlp": 0.14318848, + "step": 4533, + "time_per_iteration": 2.733445644378662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007563, + "balance_loss_mlp": 1.00112557, + "epoch": 0.8722585609849942, + "flos": 1495942318080.0, + "grad_norm": 0.005568796329920205, + "language_loss": 0.80966806, + "learning_rate": 4.2185818901119946e-05, + "loss": 0.81974363, + "num_input_tokens_seen": 375814496, + "router_z_loss_mlp": 0.06445312, + "step": 4534, + "time_per_iteration": 4.842711925506592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063185, + "balance_loss_mlp": 1.048558, + "epoch": 0.8724509426702578, + "flos": 596169123840.0, + "grad_norm": 0.06814746037567286, + "language_loss": 0.87232822, + "learning_rate": 4.206065974853479e-05, + "loss": 0.88296002, 
+ "num_input_tokens_seen": 375885440, + "router_z_loss_mlp": 0.14611816, + "step": 4535, + "time_per_iteration": 2.749300479888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010645, + "balance_loss_mlp": 1.04971766, + "epoch": 0.8726433243555214, + "flos": 443635481088.0, + "grad_norm": 0.0820490695559427, + "language_loss": 0.80679154, + "learning_rate": 4.193567838376888e-05, + "loss": 0.81743658, + "num_input_tokens_seen": 375952640, + "router_z_loss_mlp": 0.14758301, + "step": 4536, + "time_per_iteration": 2.553683042526245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059539, + "balance_loss_mlp": 1.04492414, + "epoch": 0.8728357060407849, + "flos": 553181819904.0, + "grad_norm": 0.08604953628210836, + "language_loss": 0.81798059, + "learning_rate": 4.181087485534402e-05, + "loss": 0.82857597, + "num_input_tokens_seen": 376021648, + "router_z_loss_mlp": 0.14611816, + "step": 4537, + "time_per_iteration": 2.6546003818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063406, + "balance_loss_mlp": 1.04877949, + "epoch": 0.8730280877260485, + "flos": 627807946752.0, + "grad_norm": 0.08278290011227846, + "language_loss": 0.78786474, + "learning_rate": 4.16862492117136e-05, + "loss": 0.79849875, + "num_input_tokens_seen": 376102304, + "router_z_loss_mlp": 0.14611816, + "step": 4538, + "time_per_iteration": 2.8178372383117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060174, + "balance_loss_mlp": 1.04570246, + "epoch": 0.873220469411312, + "flos": 535384359936.0, + "grad_norm": 0.06689995736603449, + "language_loss": 0.8018595, + "learning_rate": 4.156180150126143e-05, + "loss": 0.8124612, + "num_input_tokens_seen": 376177072, + "router_z_loss_mlp": 0.14465332, + "step": 4539, + "time_per_iteration": 2.743286371231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069578, + "balance_loss_mlp": 1.05512953, + "epoch": 0.8734128510965756, + "flos": 
561883972608.0, + "grad_norm": 0.08737524822490801, + "language_loss": 0.8396098, + "learning_rate": 4.143753177230242e-05, + "loss": 0.85030556, + "num_input_tokens_seen": 376251376, + "router_z_loss_mlp": 0.14453125, + "step": 4540, + "time_per_iteration": 2.707806348800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061883, + "balance_loss_mlp": 1.04744649, + "epoch": 0.8736052327818392, + "flos": 686467643904.0, + "grad_norm": 0.06680973227686807, + "language_loss": 0.79487395, + "learning_rate": 4.131344007308224e-05, + "loss": 0.80549276, + "num_input_tokens_seen": 376337104, + "router_z_loss_mlp": 0.14416504, + "step": 4541, + "time_per_iteration": 2.9801111221313477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060786, + "balance_loss_mlp": 1.04626584, + "epoch": 0.8737976144671027, + "flos": 531673113600.0, + "grad_norm": 0.07234482564699127, + "language_loss": 0.81535935, + "learning_rate": 4.1189526451777816e-05, + "loss": 0.82596719, + "num_input_tokens_seen": 376415456, + "router_z_loss_mlp": 0.14501953, + "step": 4542, + "time_per_iteration": 2.8178179264068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062393, + "balance_loss_mlp": 1.047647, + "epoch": 0.8739899961523663, + "flos": 575592086016.0, + "grad_norm": 0.06822044709345593, + "language_loss": 0.81856036, + "learning_rate": 4.106579095649649e-05, + "loss": 0.82918429, + "num_input_tokens_seen": 376494880, + "router_z_loss_mlp": 0.14733887, + "step": 4543, + "time_per_iteration": 2.8611669540405273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059548, + "balance_loss_mlp": 1.04505205, + "epoch": 0.8741823778376299, + "flos": 731332965888.0, + "grad_norm": 0.08490003911164679, + "language_loss": 0.76622522, + "learning_rate": 4.094223363527666e-05, + "loss": 0.77682072, + "num_input_tokens_seen": 376571760, + "router_z_loss_mlp": 0.14489746, + "step": 4544, + "time_per_iteration": 2.9649460315704346 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063692, + "balance_loss_mlp": 1.04885018, + "epoch": 0.8743747595228935, + "flos": 567080082432.0, + "grad_norm": 0.08047160087313358, + "language_loss": 0.83460504, + "learning_rate": 4.081885453608747e-05, + "loss": 0.84524196, + "num_input_tokens_seen": 376644464, + "router_z_loss_mlp": 0.14819336, + "step": 4545, + "time_per_iteration": 2.759756088256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064067, + "balance_loss_mlp": 1.04918993, + "epoch": 0.8745671412081569, + "flos": 493370569728.0, + "grad_norm": 0.06466903860964004, + "language_loss": 0.8239516, + "learning_rate": 4.0695653706829095e-05, + "loss": 0.83459222, + "num_input_tokens_seen": 376709584, + "router_z_loss_mlp": 0.14855957, + "step": 4546, + "time_per_iteration": 2.5922882556915283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063052, + "balance_loss_mlp": 1.0481863, + "epoch": 0.8747595228934205, + "flos": 524139766272.0, + "grad_norm": 0.06777304384321896, + "language_loss": 0.83297229, + "learning_rate": 4.057263119533233e-05, + "loss": 0.84360284, + "num_input_tokens_seen": 376779472, + "router_z_loss_mlp": 0.14831543, + "step": 4547, + "time_per_iteration": 2.626225233078003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062146, + "balance_loss_mlp": 1.0476141, + "epoch": 0.8749519045786841, + "flos": 744349118976.0, + "grad_norm": 0.07832002920068278, + "language_loss": 0.79854083, + "learning_rate": 4.044978704935853e-05, + "loss": 0.80916226, + "num_input_tokens_seen": 376863408, + "router_z_loss_mlp": 0.1451416, + "step": 4548, + "time_per_iteration": 3.0136497020721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064319, + "balance_loss_mlp": 1.04978716, + "epoch": 0.8751442862639477, + "flos": 594278995968.0, + "grad_norm": 0.0648484377907723, + "language_loss": 0.79846859, + "learning_rate": 4.032712131660027e-05, + "loss": 0.80911177, 
+ "num_input_tokens_seen": 376942080, + "router_z_loss_mlp": 0.14538574, + "step": 4549, + "time_per_iteration": 2.8334498405456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062635, + "balance_loss_mlp": 1.04817486, + "epoch": 0.8753366679492113, + "flos": 496530819072.0, + "grad_norm": 0.06635734878737051, + "language_loss": 0.7858516, + "learning_rate": 4.020463404468055e-05, + "loss": 0.79647791, + "num_input_tokens_seen": 377015696, + "router_z_loss_mlp": 0.14453125, + "step": 4550, + "time_per_iteration": 2.738966941833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106303, + "balance_loss_mlp": 1.0483439, + "epoch": 0.8755290496344748, + "flos": 489864526848.0, + "grad_norm": 0.0802538221579537, + "language_loss": 0.8152554, + "learning_rate": 4.0082325281153074e-05, + "loss": 0.82588565, + "num_input_tokens_seen": 377081424, + "router_z_loss_mlp": 0.14685059, + "step": 4551, + "time_per_iteration": 2.56887149810791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060736, + "balance_loss_mlp": 1.04596615, + "epoch": 0.8757214313197383, + "flos": 591859892736.0, + "grad_norm": 0.06834289789386311, + "language_loss": 0.81667864, + "learning_rate": 3.9960195073502345e-05, + "loss": 0.82728601, + "num_input_tokens_seen": 377159360, + "router_z_loss_mlp": 0.14746094, + "step": 4552, + "time_per_iteration": 2.794102668762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062466, + "balance_loss_mlp": 1.04806566, + "epoch": 0.8759138130050019, + "flos": 976843763712.0, + "grad_norm": 0.07625032965138905, + "language_loss": 0.77863795, + "learning_rate": 3.9838243469143555e-05, + "loss": 0.78926265, + "num_input_tokens_seen": 377240704, + "router_z_loss_mlp": 0.1439209, + "step": 4553, + "time_per_iteration": 3.2093098163604736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063858, + "balance_loss_mlp": 1.04906428, + "epoch": 0.8761061946902655, + "flos": 
802764338688.0, + "grad_norm": 0.062390940172138094, + "language_loss": 0.77533054, + "learning_rate": 3.971647051542243e-05, + "loss": 0.78596914, + "num_input_tokens_seen": 377324176, + "router_z_loss_mlp": 0.14770508, + "step": 4554, + "time_per_iteration": 3.070384979248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106235, + "balance_loss_mlp": 1.04777074, + "epoch": 0.8762985763755291, + "flos": 698495602176.0, + "grad_norm": 0.06693574934874094, + "language_loss": 0.74468589, + "learning_rate": 3.95948762596155e-05, + "loss": 0.7553094, + "num_input_tokens_seen": 377403440, + "router_z_loss_mlp": 0.14562988, + "step": 4555, + "time_per_iteration": 2.9657835960388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061359, + "balance_loss_mlp": 1.04670799, + "epoch": 0.8764909580607926, + "flos": 629717898240.0, + "grad_norm": 0.07988560092503469, + "language_loss": 0.80092323, + "learning_rate": 3.9473460748929765e-05, + "loss": 0.81153679, + "num_input_tokens_seen": 377483440, + "router_z_loss_mlp": 0.14648438, + "step": 4556, + "time_per_iteration": 2.8684329986572266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064834, + "balance_loss_mlp": 1.050565, + "epoch": 0.8766833397460562, + "flos": 481545243648.0, + "grad_norm": 0.08423746847970588, + "language_loss": 0.80034041, + "learning_rate": 3.935222403050304e-05, + "loss": 0.81098878, + "num_input_tokens_seen": 377554688, + "router_z_loss_mlp": 0.1427002, + "step": 4557, + "time_per_iteration": 2.7271201610565186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067348, + "balance_loss_mlp": 1.05256641, + "epoch": 0.8768757214313198, + "flos": 407734414848.0, + "grad_norm": 0.07266214938945337, + "language_loss": 0.78330112, + "learning_rate": 3.923116615140354e-05, + "loss": 0.79397452, + "num_input_tokens_seen": 377617616, + "router_z_loss_mlp": 0.14758301, + "step": 4558, + "time_per_iteration": 2.534062147140503 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064084, + "balance_loss_mlp": 1.04936194, + "epoch": 0.8770681031165833, + "flos": 582582150144.0, + "grad_norm": 0.07558368157454017, + "language_loss": 0.81913722, + "learning_rate": 3.9110287158630076e-05, + "loss": 0.82977808, + "num_input_tokens_seen": 377685888, + "router_z_loss_mlp": 0.14697266, + "step": 4559, + "time_per_iteration": 2.6745707988739014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065429, + "balance_loss_mlp": 1.05061114, + "epoch": 0.8772604848018468, + "flos": 508687257600.0, + "grad_norm": 0.08298744774134015, + "language_loss": 0.80581164, + "learning_rate": 3.8989587099111875e-05, + "loss": 0.81646591, + "num_input_tokens_seen": 377755744, + "router_z_loss_mlp": 0.14794922, + "step": 4560, + "time_per_iteration": 2.6083872318267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066798, + "balance_loss_mlp": 1.05203962, + "epoch": 0.8774528664871104, + "flos": 408836408832.0, + "grad_norm": 0.08583814592786851, + "language_loss": 0.85218108, + "learning_rate": 3.886906601970913e-05, + "loss": 0.86284906, + "num_input_tokens_seen": 377818880, + "router_z_loss_mlp": 0.14733887, + "step": 4561, + "time_per_iteration": 2.453648805618286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067415, + "balance_loss_mlp": 1.05254984, + "epoch": 0.877645248172374, + "flos": 500844819456.0, + "grad_norm": 0.06593803306167176, + "language_loss": 0.83422303, + "learning_rate": 3.8748723967212184e-05, + "loss": 0.84489715, + "num_input_tokens_seen": 377893280, + "router_z_loss_mlp": 0.14855957, + "step": 4562, + "time_per_iteration": 2.6662542819976807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063646, + "balance_loss_mlp": 1.04856586, + "epoch": 0.8778376298576376, + "flos": 633145019904.0, + "grad_norm": 0.07101645865230781, + "language_loss": 0.77801663, + "learning_rate": 3.862856098834189e-05, + "loss": 
0.78865308, + "num_input_tokens_seen": 377972912, + "router_z_loss_mlp": 0.1505127, + "step": 4563, + "time_per_iteration": 2.8906450271606445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070539, + "balance_loss_mlp": 1.05560255, + "epoch": 0.8780300115429012, + "flos": 533988329472.0, + "grad_norm": 0.07397015685289171, + "language_loss": 0.8016603, + "learning_rate": 3.850857712974976e-05, + "loss": 0.81236565, + "num_input_tokens_seen": 378054000, + "router_z_loss_mlp": 0.14916992, + "step": 4564, + "time_per_iteration": 2.8398656845092773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066243, + "balance_loss_mlp": 1.05191386, + "epoch": 0.8782223932281646, + "flos": 511662127104.0, + "grad_norm": 0.06215610141963286, + "language_loss": 0.77037019, + "learning_rate": 3.838877243801758e-05, + "loss": 0.78103256, + "num_input_tokens_seen": 378120336, + "router_z_loss_mlp": 0.14331055, + "step": 4565, + "time_per_iteration": 2.606433153152466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064056, + "balance_loss_mlp": 1.04955995, + "epoch": 0.8784147749134282, + "flos": 780714547200.0, + "grad_norm": 0.08789317923638273, + "language_loss": 0.69927686, + "learning_rate": 3.826914695965766e-05, + "loss": 0.70991743, + "num_input_tokens_seen": 378216672, + "router_z_loss_mlp": 0.14489746, + "step": 4566, + "time_per_iteration": 3.193535804748535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067784, + "balance_loss_mlp": 1.05303764, + "epoch": 0.8786071565986918, + "flos": 561004434432.0, + "grad_norm": 0.10908406210790224, + "language_loss": 0.75545955, + "learning_rate": 3.814970074111279e-05, + "loss": 0.76613748, + "num_input_tokens_seen": 378287536, + "router_z_loss_mlp": 0.1472168, + "step": 4567, + "time_per_iteration": 2.7053375244140625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063765, + "balance_loss_mlp": 1.04924548, + "epoch": 0.8787995382839554, + 
"flos": 603448081920.0, + "grad_norm": 0.06509274087171016, + "language_loss": 0.77338004, + "learning_rate": 3.8030433828755926e-05, + "loss": 0.78401768, + "num_input_tokens_seen": 378362128, + "router_z_loss_mlp": 0.14501953, + "step": 4568, + "time_per_iteration": 2.823720693588257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067338, + "balance_loss_mlp": 1.05299711, + "epoch": 0.8789919199692189, + "flos": 560233552896.0, + "grad_norm": 0.06476749948002929, + "language_loss": 0.85155976, + "learning_rate": 3.7911346268890924e-05, + "loss": 0.86223316, + "num_input_tokens_seen": 378435696, + "router_z_loss_mlp": 0.14355469, + "step": 4569, + "time_per_iteration": 2.69594407081604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063613, + "balance_loss_mlp": 1.04903364, + "epoch": 0.8791843016544825, + "flos": 539115429888.0, + "grad_norm": 0.09405373492006784, + "language_loss": 0.81978583, + "learning_rate": 3.7792438107751405e-05, + "loss": 0.83042198, + "num_input_tokens_seen": 378505664, + "router_z_loss_mlp": 0.14575195, + "step": 4570, + "time_per_iteration": 2.7766315937042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066864, + "balance_loss_mlp": 1.05215406, + "epoch": 0.8793766833397461, + "flos": 1008699899904.0, + "grad_norm": 0.06533116595538893, + "language_loss": 0.79086006, + "learning_rate": 3.767370939150167e-05, + "loss": 0.80152869, + "num_input_tokens_seen": 378598016, + "router_z_loss_mlp": 0.14709473, + "step": 4571, + "time_per_iteration": 3.391723871231079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064404, + "balance_loss_mlp": 1.04999161, + "epoch": 0.8795690650250096, + "flos": 678637688832.0, + "grad_norm": 0.06515918314905815, + "language_loss": 0.81039464, + "learning_rate": 3.755516016623628e-05, + "loss": 0.82103866, + "num_input_tokens_seen": 378676176, + "router_z_loss_mlp": 0.14404297, + "step": 4572, + "time_per_iteration": 
2.877964496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065039, + "balance_loss_mlp": 1.05043602, + "epoch": 0.8797614467102732, + "flos": 453432287232.0, + "grad_norm": 0.07838900846740328, + "language_loss": 0.88639665, + "learning_rate": 3.7436790477980157e-05, + "loss": 0.8970471, + "num_input_tokens_seen": 378737952, + "router_z_loss_mlp": 0.14575195, + "step": 4573, + "time_per_iteration": 2.562926769256592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062634, + "balance_loss_mlp": 1.0480783, + "epoch": 0.8799538283955367, + "flos": 550913591808.0, + "grad_norm": 0.06634304029009142, + "language_loss": 0.84095144, + "learning_rate": 3.7318600372688526e-05, + "loss": 0.85157776, + "num_input_tokens_seen": 378806704, + "router_z_loss_mlp": 0.14538574, + "step": 4574, + "time_per_iteration": 2.6689820289611816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068241, + "balance_loss_mlp": 1.05397153, + "epoch": 0.8801462100808003, + "flos": 807429275136.0, + "grad_norm": 0.08408376547428717, + "language_loss": 0.84203458, + "learning_rate": 3.720058989624681e-05, + "loss": 0.85271698, + "num_input_tokens_seen": 378887616, + "router_z_loss_mlp": 0.1427002, + "step": 4575, + "time_per_iteration": 3.06958270072937 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069659, + "balance_loss_mlp": 1.05469871, + "epoch": 0.8803385917660639, + "flos": 768694302720.0, + "grad_norm": 0.06560709355959533, + "language_loss": 0.84476829, + "learning_rate": 3.708275909447079e-05, + "loss": 0.85546494, + "num_input_tokens_seen": 378964656, + "router_z_loss_mlp": 0.14941406, + "step": 4576, + "time_per_iteration": 2.9229040145874023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067971, + "balance_loss_mlp": 1.05327201, + "epoch": 0.8805309734513275, + "flos": 567339614208.0, + "grad_norm": 0.06356861295382751, + "language_loss": 0.81037927, + "learning_rate": 
3.696510801310632e-05, + "loss": 0.82105893, + "num_input_tokens_seen": 379036752, + "router_z_loss_mlp": 0.14685059, + "step": 4577, + "time_per_iteration": 2.735290765762329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068874, + "balance_loss_mlp": 1.05421138, + "epoch": 0.880723355136591, + "flos": 679779330048.0, + "grad_norm": 0.07286316970096472, + "language_loss": 0.81711239, + "learning_rate": 3.6847636697829755e-05, + "loss": 0.82780111, + "num_input_tokens_seen": 379106480, + "router_z_loss_mlp": 0.1463623, + "step": 4578, + "time_per_iteration": 2.812211275100708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065505, + "balance_loss_mlp": 1.05084252, + "epoch": 0.8809157368218545, + "flos": 565629723648.0, + "grad_norm": 0.06978735533151822, + "language_loss": 0.79132414, + "learning_rate": 3.673034519424734e-05, + "loss": 0.80197918, + "num_input_tokens_seen": 379182544, + "router_z_loss_mlp": 0.1463623, + "step": 4579, + "time_per_iteration": 2.7452139854431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067759, + "balance_loss_mlp": 1.05331051, + "epoch": 0.8811081185071181, + "flos": 515407878144.0, + "grad_norm": 0.07097224147621632, + "language_loss": 0.76073337, + "learning_rate": 3.661323354789586e-05, + "loss": 0.77141094, + "num_input_tokens_seen": 379255856, + "router_z_loss_mlp": 0.14440918, + "step": 4580, + "time_per_iteration": 2.6742916107177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066849, + "balance_loss_mlp": 1.05221033, + "epoch": 0.8813005001923817, + "flos": 594343236096.0, + "grad_norm": 0.11678051247214369, + "language_loss": 0.81309009, + "learning_rate": 3.649630180424191e-05, + "loss": 0.8237586, + "num_input_tokens_seen": 379322704, + "router_z_loss_mlp": 0.1463623, + "step": 4581, + "time_per_iteration": 2.676151752471924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106574, + "balance_loss_mlp": 1.05135107, + 
"epoch": 0.8814928818776453, + "flos": 666940843008.0, + "grad_norm": 0.07866838173150745, + "language_loss": 0.78949201, + "learning_rate": 3.637955000868254e-05, + "loss": 0.80014944, + "num_input_tokens_seen": 379395008, + "router_z_loss_mlp": 0.1439209, + "step": 4582, + "time_per_iteration": 2.841001510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064783, + "balance_loss_mlp": 1.0505619, + "epoch": 0.8816852635629088, + "flos": 609153343488.0, + "grad_norm": 0.08084171003417935, + "language_loss": 0.85922098, + "learning_rate": 3.626297820654467e-05, + "loss": 0.86986876, + "num_input_tokens_seen": 379465824, + "router_z_loss_mlp": 0.14221191, + "step": 4583, + "time_per_iteration": 2.817744016647339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067996, + "balance_loss_mlp": 1.05326128, + "epoch": 0.8818776452481724, + "flos": 480379009536.0, + "grad_norm": 0.08737806044600016, + "language_loss": 0.81773436, + "learning_rate": 3.614658644308572e-05, + "loss": 0.82841432, + "num_input_tokens_seen": 379534960, + "router_z_loss_mlp": 0.14709473, + "step": 4584, + "time_per_iteration": 2.697969913482666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073178, + "balance_loss_mlp": 1.05840755, + "epoch": 0.882070026933436, + "flos": 1045394242560.0, + "grad_norm": 0.07560542968481543, + "language_loss": 0.73435783, + "learning_rate": 3.60303747634928e-05, + "loss": 0.74508959, + "num_input_tokens_seen": 379617456, + "router_z_loss_mlp": 0.14758301, + "step": 4585, + "time_per_iteration": 3.3223116397857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062597, + "balance_loss_mlp": 1.04780316, + "epoch": 0.8822624086186995, + "flos": 474409446912.0, + "grad_norm": 0.06954120995359621, + "language_loss": 0.79935622, + "learning_rate": 3.591434321288345e-05, + "loss": 0.80998224, + "num_input_tokens_seen": 379687792, + "router_z_loss_mlp": 0.14770508, + "step": 4586, + 
"time_per_iteration": 2.6584787368774414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063544, + "balance_loss_mlp": 1.04922748, + "epoch": 0.882454790303963, + "flos": 654023434752.0, + "grad_norm": 0.0731006388758823, + "language_loss": 0.81770998, + "learning_rate": 3.579849183630485e-05, + "loss": 0.82834542, + "num_input_tokens_seen": 379761120, + "router_z_loss_mlp": 0.14331055, + "step": 4587, + "time_per_iteration": 2.8163564205169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062533, + "balance_loss_mlp": 1.0481801, + "epoch": 0.8826471719892266, + "flos": 470325242880.0, + "grad_norm": 0.1045221274060957, + "language_loss": 0.78476524, + "learning_rate": 3.568282067873468e-05, + "loss": 0.79539055, + "num_input_tokens_seen": 379829008, + "router_z_loss_mlp": 0.14355469, + "step": 4588, + "time_per_iteration": 2.5708115100860596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064335, + "balance_loss_mlp": 1.04931498, + "epoch": 0.8828395536744902, + "flos": 468753744384.0, + "grad_norm": 0.06849748948531013, + "language_loss": 0.83737075, + "learning_rate": 3.556732978508048e-05, + "loss": 0.84801412, + "num_input_tokens_seen": 379899584, + "router_z_loss_mlp": 0.15014648, + "step": 4589, + "time_per_iteration": 2.7350192070007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066195, + "balance_loss_mlp": 1.05163944, + "epoch": 0.8830319353597538, + "flos": 721377944064.0, + "grad_norm": 0.09265144488683381, + "language_loss": 0.81130779, + "learning_rate": 3.545201920017971e-05, + "loss": 0.82196975, + "num_input_tokens_seen": 379979440, + "router_z_loss_mlp": 0.14550781, + "step": 4590, + "time_per_iteration": 2.9506759643554688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063865, + "balance_loss_mlp": 1.04927421, + "epoch": 0.8832243170450174, + "flos": 443277204480.0, + "grad_norm": 0.07536545400384899, + "language_loss": 0.8124311, + 
"learning_rate": 3.5336888968799996e-05, + "loss": 0.82306975, + "num_input_tokens_seen": 380046944, + "router_z_loss_mlp": 0.14599609, + "step": 4591, + "time_per_iteration": 2.568519353866577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066816, + "balance_loss_mlp": 1.05218911, + "epoch": 0.8834166987302808, + "flos": 566583413760.0, + "grad_norm": 0.07974593182180129, + "language_loss": 0.82008839, + "learning_rate": 3.5221939135638756e-05, + "loss": 0.83075655, + "num_input_tokens_seen": 380118048, + "router_z_loss_mlp": 0.14611816, + "step": 4592, + "time_per_iteration": 2.74800968170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064472, + "balance_loss_mlp": 1.04994082, + "epoch": 0.8836090804155444, + "flos": 609316328448.0, + "grad_norm": 0.08282529824241759, + "language_loss": 0.81985712, + "learning_rate": 3.510716974532352e-05, + "loss": 0.83050191, + "num_input_tokens_seen": 380192416, + "router_z_loss_mlp": 0.14538574, + "step": 4593, + "time_per_iteration": 2.797036647796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062162, + "balance_loss_mlp": 1.04726076, + "epoch": 0.883801462100808, + "flos": 557065963008.0, + "grad_norm": 0.07056382399826802, + "language_loss": 0.8015058, + "learning_rate": 3.4992580842411745e-05, + "loss": 0.81212735, + "num_input_tokens_seen": 380264432, + "router_z_loss_mlp": 0.14880371, + "step": 4594, + "time_per_iteration": 2.7425427436828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062451, + "balance_loss_mlp": 1.04733491, + "epoch": 0.8839938437860716, + "flos": 516188671488.0, + "grad_norm": 0.07638624287700241, + "language_loss": 0.77165449, + "learning_rate": 3.487817247139064e-05, + "loss": 0.78227895, + "num_input_tokens_seen": 380334192, + "router_z_loss_mlp": 0.15100098, + "step": 4595, + "time_per_iteration": 2.6234378814697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058543, + 
"balance_loss_mlp": 1.04419065, + "epoch": 0.8841862254713351, + "flos": 713696292864.0, + "grad_norm": 0.06917902980564926, + "language_loss": 0.78930062, + "learning_rate": 3.47639446766777e-05, + "loss": 0.79988611, + "num_input_tokens_seen": 380407504, + "router_z_loss_mlp": 0.14343262, + "step": 4596, + "time_per_iteration": 2.9058618545532227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010623, + "balance_loss_mlp": 1.04753017, + "epoch": 0.8843786071565987, + "flos": 833975875584.0, + "grad_norm": 0.0690866392470046, + "language_loss": 0.82326406, + "learning_rate": 3.4649897502620095e-05, + "loss": 0.8338871, + "num_input_tokens_seen": 380486272, + "router_z_loss_mlp": 0.14746094, + "step": 4597, + "time_per_iteration": 3.100800037384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064515, + "balance_loss_mlp": 1.04985189, + "epoch": 0.8845709888418622, + "flos": 656884505088.0, + "grad_norm": 0.058588985333644296, + "language_loss": 0.82849264, + "learning_rate": 3.453603099349462e-05, + "loss": 0.83913779, + "num_input_tokens_seen": 380568480, + "router_z_loss_mlp": 0.14648438, + "step": 4598, + "time_per_iteration": 2.9068336486816406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060442, + "balance_loss_mlp": 1.0461247, + "epoch": 0.8847633705271258, + "flos": 523326666240.0, + "grad_norm": 0.06896375109590577, + "language_loss": 0.80785215, + "learning_rate": 3.442234519350823e-05, + "loss": 0.81845653, + "num_input_tokens_seen": 380643088, + "router_z_loss_mlp": 0.14306641, + "step": 4599, + "time_per_iteration": 2.7556896209716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062174, + "balance_loss_mlp": 1.04742825, + "epoch": 0.8849557522123894, + "flos": 548591035392.0, + "grad_norm": 0.07253846816892973, + "language_loss": 0.84080333, + "learning_rate": 3.430884014679786e-05, + "loss": 0.85142505, + "num_input_tokens_seen": 380714512, + "router_z_loss_mlp": 
0.1472168, + "step": 4600, + "time_per_iteration": 2.676515579223633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070121, + "balance_loss_mlp": 1.05523205, + "epoch": 0.8851481338976529, + "flos": 622372128768.0, + "grad_norm": 0.06699295131360646, + "language_loss": 0.83428752, + "learning_rate": 3.4195515897429974e-05, + "loss": 0.84498876, + "num_input_tokens_seen": 380789168, + "router_z_loss_mlp": 0.14868164, + "step": 4601, + "time_per_iteration": 2.7971203327178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064622, + "balance_loss_mlp": 1.0499115, + "epoch": 0.8853405155829165, + "flos": 444359374848.0, + "grad_norm": 0.05944200349893636, + "language_loss": 0.80591571, + "learning_rate": 3.408237248940088e-05, + "loss": 0.81656194, + "num_input_tokens_seen": 380856992, + "router_z_loss_mlp": 0.14672852, + "step": 4602, + "time_per_iteration": 2.5471625328063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_mlp": 1.04959369, + "epoch": 0.8855328972681801, + "flos": 730470680064.0, + "grad_norm": 0.0684300317652771, + "language_loss": 0.78215384, + "learning_rate": 3.396940996663683e-05, + "loss": 0.79279757, + "num_input_tokens_seen": 380930480, + "router_z_loss_mlp": 0.14770508, + "step": 4603, + "time_per_iteration": 2.9694807529449463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061356, + "balance_loss_mlp": 1.04694319, + "epoch": 0.8857252789534437, + "flos": 487376414208.0, + "grad_norm": 0.06851899804046666, + "language_loss": 0.7892375, + "learning_rate": 3.385662837299375e-05, + "loss": 0.79985106, + "num_input_tokens_seen": 380994192, + "router_z_loss_mlp": 0.14404297, + "step": 4604, + "time_per_iteration": 2.5907418727874756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064544, + "balance_loss_mlp": 1.0501318, + "epoch": 0.8859176606387072, + "flos": 508556206080.0, + "grad_norm": 0.07804226376806674, + 
"language_loss": 0.81699598, + "learning_rate": 3.374402775225727e-05, + "loss": 0.82764149, + "num_input_tokens_seen": 381066848, + "router_z_loss_mlp": 0.14404297, + "step": 4605, + "time_per_iteration": 2.71748685836792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066417, + "balance_loss_mlp": 1.05155182, + "epoch": 0.8861100423239707, + "flos": 516628440576.0, + "grad_norm": 0.07418647332566988, + "language_loss": 0.85657847, + "learning_rate": 3.3631608148142925e-05, + "loss": 0.86724257, + "num_input_tokens_seen": 381138816, + "router_z_loss_mlp": 0.14831543, + "step": 4606, + "time_per_iteration": 2.668240547180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065286, + "balance_loss_mlp": 1.05090928, + "epoch": 0.8863024240092343, + "flos": 626975396352.0, + "grad_norm": 0.07368984647111583, + "language_loss": 0.79516572, + "learning_rate": 3.3519369604295746e-05, + "loss": 0.80581862, + "num_input_tokens_seen": 381208448, + "router_z_loss_mlp": 0.14367676, + "step": 4607, + "time_per_iteration": 2.7733378410339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062225, + "balance_loss_mlp": 1.04724002, + "epoch": 0.8864948056944979, + "flos": 766910260224.0, + "grad_norm": 0.06561105388045792, + "language_loss": 0.83195376, + "learning_rate": 3.340731216429083e-05, + "loss": 0.84257591, + "num_input_tokens_seen": 381289712, + "router_z_loss_mlp": 0.14953613, + "step": 4608, + "time_per_iteration": 2.9877283573150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100936, + "balance_loss_mlp": 1.00301838, + "epoch": 0.8866871873797615, + "flos": 1502331452928.0, + "grad_norm": 0.00781784977346765, + "language_loss": 0.78830957, + "learning_rate": 3.329543587163253e-05, + "loss": 0.7984032, + "num_input_tokens_seen": 381520848, + "router_z_loss_mlp": 0.06347656, + "step": 4609, + "time_per_iteration": 4.844639301300049 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01066515, + "balance_loss_mlp": 1.05162632, + "epoch": 0.886879569065025, + "flos": 811516050432.0, + "grad_norm": 0.08224338337652813, + "language_loss": 0.81893122, + "learning_rate": 3.3183740769755e-05, + "loss": 0.8295964, + "num_input_tokens_seen": 381603008, + "router_z_loss_mlp": 0.14868164, + "step": 4610, + "time_per_iteration": 3.0459225177764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008504, + "balance_loss_mlp": 1.00216174, + "epoch": 0.8870719507502886, + "flos": 1582838309376.0, + "grad_norm": 0.007754182988627756, + "language_loss": 0.7691083, + "learning_rate": 3.307222690202238e-05, + "loss": 0.7791934, + "num_input_tokens_seen": 381844336, + "router_z_loss_mlp": 0.06347656, + "step": 4611, + "time_per_iteration": 4.932186841964722 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065487, + "balance_loss_mlp": 1.05087233, + "epoch": 0.8872643324355521, + "flos": 634027129344.0, + "grad_norm": 0.08360001369051026, + "language_loss": 0.7498275, + "learning_rate": 3.296089431172811e-05, + "loss": 0.76048243, + "num_input_tokens_seen": 381918576, + "router_z_loss_mlp": 0.14599609, + "step": 4612, + "time_per_iteration": 2.8011648654937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061275, + "balance_loss_mlp": 1.04660034, + "epoch": 0.8874567141208157, + "flos": 535755119616.0, + "grad_norm": 0.08952554638060775, + "language_loss": 0.83154523, + "learning_rate": 3.284974304209532e-05, + "loss": 0.84215796, + "num_input_tokens_seen": 381987296, + "router_z_loss_mlp": 0.14660645, + "step": 4613, + "time_per_iteration": 2.609548330307007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062476, + "balance_loss_mlp": 1.04796779, + "epoch": 0.8876490958060793, + "flos": 1566302552064.0, + "grad_norm": 0.06343704124989273, + "language_loss": 0.79367721, + "learning_rate": 3.27387731362766e-05, + "loss": 0.80430192, + "num_input_tokens_seen": 
382091744, + "router_z_loss_mlp": 0.14489746, + "step": 4614, + "time_per_iteration": 3.8918566703796387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063989, + "balance_loss_mlp": 1.04920697, + "epoch": 0.8878414774913428, + "flos": 636633810432.0, + "grad_norm": 0.06419240739581336, + "language_loss": 0.84827816, + "learning_rate": 3.2627984637354444e-05, + "loss": 0.85891807, + "num_input_tokens_seen": 382169600, + "router_z_loss_mlp": 0.14758301, + "step": 4615, + "time_per_iteration": 2.793135643005371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064236, + "balance_loss_mlp": 1.0492754, + "epoch": 0.8880338591766064, + "flos": 496429502976.0, + "grad_norm": 0.09309272937962464, + "language_loss": 0.81416452, + "learning_rate": 3.251737758834084e-05, + "loss": 0.82480693, + "num_input_tokens_seen": 382238336, + "router_z_loss_mlp": 0.14953613, + "step": 4616, + "time_per_iteration": 2.5885441303253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105746, + "balance_loss_mlp": 1.04276133, + "epoch": 0.88822624086187, + "flos": 542861180928.0, + "grad_norm": 0.06822196575636882, + "language_loss": 0.79946053, + "learning_rate": 3.2406952032177086e-05, + "loss": 0.81003511, + "num_input_tokens_seen": 382308560, + "router_z_loss_mlp": 0.14672852, + "step": 4617, + "time_per_iteration": 2.6519088745117188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058923, + "balance_loss_mlp": 1.04404545, + "epoch": 0.8884186225471336, + "flos": 551822865408.0, + "grad_norm": 0.09412059292181414, + "language_loss": 0.83855939, + "learning_rate": 3.229670801173418e-05, + "loss": 0.84914863, + "num_input_tokens_seen": 382377504, + "router_z_loss_mlp": 0.14855957, + "step": 4618, + "time_per_iteration": 2.6311991214752197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01009378, + "balance_loss_mlp": 1.0030365, + "epoch": 0.888611004232397, + "flos": 1565263305216.0, + "grad_norm": 
0.009639459863935263, + "language_loss": 0.78512192, + "learning_rate": 3.218664556981288e-05, + "loss": 0.79521573, + "num_input_tokens_seen": 382615728, + "router_z_loss_mlp": 0.06347656, + "step": 4619, + "time_per_iteration": 5.020185232162476 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064421, + "balance_loss_mlp": 1.05006814, + "epoch": 0.8888033859176606, + "flos": 767028828672.0, + "grad_norm": 0.06660971873423523, + "language_loss": 0.8234545, + "learning_rate": 3.207676474914301e-05, + "loss": 0.8340987, + "num_input_tokens_seen": 382695552, + "router_z_loss_mlp": 0.14343262, + "step": 4620, + "time_per_iteration": 3.0802297592163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058079, + "balance_loss_mlp": 1.04369044, + "epoch": 0.8889957676029242, + "flos": 934110849024.0, + "grad_norm": 0.07396102044579353, + "language_loss": 0.84266019, + "learning_rate": 3.1967065592384105e-05, + "loss": 0.85324097, + "num_input_tokens_seen": 382775824, + "router_z_loss_mlp": 0.14379883, + "step": 4621, + "time_per_iteration": 3.201620578765869 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066988, + "balance_loss_mlp": 1.05226541, + "epoch": 0.8891881492881878, + "flos": 589611488256.0, + "grad_norm": 0.06887595273233507, + "language_loss": 0.81630599, + "learning_rate": 3.1857548142125104e-05, + "loss": 0.82697588, + "num_input_tokens_seen": 382854464, + "router_z_loss_mlp": 0.14709473, + "step": 4622, + "time_per_iteration": 2.7901487350463867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064252, + "balance_loss_mlp": 1.04972029, + "epoch": 0.8893805309734514, + "flos": 540718861824.0, + "grad_norm": 0.08404775125115564, + "language_loss": 0.82411218, + "learning_rate": 3.174821244088466e-05, + "loss": 0.83475471, + "num_input_tokens_seen": 382925088, + "router_z_loss_mlp": 0.14526367, + "step": 4623, + "time_per_iteration": 2.7144739627838135 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01062011, + "balance_loss_mlp": 1.04719353, + "epoch": 0.8895729126587149, + "flos": 560095160832.0, + "grad_norm": 0.07528407764846204, + "language_loss": 0.81692713, + "learning_rate": 3.163905853111054e-05, + "loss": 0.82754725, + "num_input_tokens_seen": 382998640, + "router_z_loss_mlp": 0.14794922, + "step": 4624, + "time_per_iteration": 2.684248447418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068554, + "balance_loss_mlp": 1.05377233, + "epoch": 0.8897652943439784, + "flos": 610154021376.0, + "grad_norm": 0.07526595335560629, + "language_loss": 0.81158483, + "learning_rate": 3.153008645517996e-05, + "loss": 0.82227045, + "num_input_tokens_seen": 383076000, + "router_z_loss_mlp": 0.14758301, + "step": 4625, + "time_per_iteration": 2.745210886001587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060583, + "balance_loss_mlp": 1.04557419, + "epoch": 0.889957676029242, + "flos": 917847811584.0, + "grad_norm": 0.07027542614256606, + "language_loss": 0.77104485, + "learning_rate": 3.142129625539969e-05, + "loss": 0.78165066, + "num_input_tokens_seen": 383166640, + "router_z_loss_mlp": 0.15002441, + "step": 4626, + "time_per_iteration": 3.2061305046081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067065, + "balance_loss_mlp": 1.05217612, + "epoch": 0.8901500577145056, + "flos": 488698292736.0, + "grad_norm": 0.0704878908918983, + "language_loss": 0.8078301, + "learning_rate": 3.131268797400588e-05, + "loss": 0.81850064, + "num_input_tokens_seen": 383232928, + "router_z_loss_mlp": 0.14855957, + "step": 4627, + "time_per_iteration": 2.607419013977051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061017, + "balance_loss_mlp": 1.0462352, + "epoch": 0.8903424393997691, + "flos": 733648181760.0, + "grad_norm": 0.07540128325428244, + "language_loss": 0.80532998, + "learning_rate": 3.120426165316398e-05, + "loss": 0.81594014, + "num_input_tokens_seen": 
383314352, + "router_z_loss_mlp": 0.14770508, + "step": 4628, + "time_per_iteration": 3.0157666206359863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060872, + "balance_loss_mlp": 1.04630482, + "epoch": 0.8905348210850327, + "flos": 519813282816.0, + "grad_norm": 0.06608891713828716, + "language_loss": 0.81858778, + "learning_rate": 3.109601733496881e-05, + "loss": 0.82919651, + "num_input_tokens_seen": 383384848, + "router_z_loss_mlp": 0.14562988, + "step": 4629, + "time_per_iteration": 2.6610121726989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063368, + "balance_loss_mlp": 1.04870582, + "epoch": 0.8907272027702963, + "flos": 578976989184.0, + "grad_norm": 0.06357905643630052, + "language_loss": 0.79617715, + "learning_rate": 3.098795506144458e-05, + "loss": 0.80681086, + "num_input_tokens_seen": 383463360, + "router_z_loss_mlp": 0.14648438, + "step": 4630, + "time_per_iteration": 2.818662405014038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061916, + "balance_loss_mlp": 1.04731333, + "epoch": 0.8909195844555599, + "flos": 893628910080.0, + "grad_norm": 0.08011777081386978, + "language_loss": 0.79218996, + "learning_rate": 3.088007487454475e-05, + "loss": 0.80280912, + "num_input_tokens_seen": 383542080, + "router_z_loss_mlp": 0.14599609, + "step": 4631, + "time_per_iteration": 3.111326217651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065393, + "balance_loss_mlp": 1.0505271, + "epoch": 0.8911119661408234, + "flos": 549865926144.0, + "grad_norm": 0.07451695723155916, + "language_loss": 0.84347403, + "learning_rate": 3.077237681615208e-05, + "loss": 0.854128, + "num_input_tokens_seen": 383613056, + "router_z_loss_mlp": 0.14855957, + "step": 4632, + "time_per_iteration": 2.654611349105835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062727, + "balance_loss_mlp": 1.04776609, + "epoch": 0.8913043478260869, + "flos": 481139979264.0, + "grad_norm": 
0.1272094121243378, + "language_loss": 0.83604395, + "learning_rate": 3.066486092807874e-05, + "loss": 0.84667122, + "num_input_tokens_seen": 383683280, + "router_z_loss_mlp": 0.14941406, + "step": 4633, + "time_per_iteration": 2.789865732192993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066836, + "balance_loss_mlp": 1.05250716, + "epoch": 0.8914967295113505, + "flos": 484581782016.0, + "grad_norm": 0.06541426651234629, + "language_loss": 0.85132289, + "learning_rate": 3.055752725206601e-05, + "loss": 0.86199123, + "num_input_tokens_seen": 383754624, + "router_z_loss_mlp": 0.14331055, + "step": 4634, + "time_per_iteration": 2.717449426651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062211, + "balance_loss_mlp": 1.04766774, + "epoch": 0.8916891111966141, + "flos": 445664001024.0, + "grad_norm": 0.0775971104699302, + "language_loss": 0.81119001, + "learning_rate": 3.0450375829784714e-05, + "loss": 0.82181215, + "num_input_tokens_seen": 383821984, + "router_z_loss_mlp": 0.14538574, + "step": 4635, + "time_per_iteration": 2.561291456222534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062854, + "balance_loss_mlp": 1.04850113, + "epoch": 0.8918814928818777, + "flos": 564016379904.0, + "grad_norm": 0.06339714050321119, + "language_loss": 0.78017372, + "learning_rate": 3.034340670283453e-05, + "loss": 0.7908023, + "num_input_tokens_seen": 383890880, + "router_z_loss_mlp": 0.14343262, + "step": 4636, + "time_per_iteration": 2.745828151702881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061938, + "balance_loss_mlp": 1.04759729, + "epoch": 0.8920738745671412, + "flos": 575943022080.0, + "grad_norm": 0.06775323964020447, + "language_loss": 0.81232381, + "learning_rate": 3.0236619912744513e-05, + "loss": 0.82294321, + "num_input_tokens_seen": 383962480, + "router_z_loss_mlp": 0.14343262, + "step": 4637, + "time_per_iteration": 2.7148401737213135 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01058592, + "balance_loss_mlp": 1.04469275, + "epoch": 0.8922662562524047, + "flos": 620180623872.0, + "grad_norm": 0.07445586698610686, + "language_loss": 0.84255946, + "learning_rate": 3.0130015500973163e-05, + "loss": 0.8531453, + "num_input_tokens_seen": 384033616, + "router_z_loss_mlp": 0.13916016, + "step": 4638, + "time_per_iteration": 2.7227747440338135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065871, + "balance_loss_mlp": 1.05116081, + "epoch": 0.8924586379376683, + "flos": 583624673280.0, + "grad_norm": 0.06905475255164643, + "language_loss": 0.79193419, + "learning_rate": 3.0023593508907877e-05, + "loss": 0.80259293, + "num_input_tokens_seen": 384108848, + "router_z_loss_mlp": 0.14709473, + "step": 4639, + "time_per_iteration": 2.788235664367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061668, + "balance_loss_mlp": 1.04706538, + "epoch": 0.8926510196229319, + "flos": 525177520128.0, + "grad_norm": 0.05686103058965172, + "language_loss": 0.81477505, + "learning_rate": 2.991735397786538e-05, + "loss": 0.82539171, + "num_input_tokens_seen": 384185728, + "router_z_loss_mlp": 0.14599609, + "step": 4640, + "time_per_iteration": 2.7929677963256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063545, + "balance_loss_mlp": 1.04906142, + "epoch": 0.8928434013081955, + "flos": 486669772800.0, + "grad_norm": 0.07327092814585671, + "language_loss": 0.8064788, + "learning_rate": 2.981129694909146e-05, + "loss": 0.81711423, + "num_input_tokens_seen": 384251552, + "router_z_loss_mlp": 0.14465332, + "step": 4641, + "time_per_iteration": 2.547323226928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008945, + "balance_loss_mlp": 1.00260293, + "epoch": 0.893035782993459, + "flos": 1448302560768.0, + "grad_norm": 0.006067492083133456, + "language_loss": 0.80330861, + "learning_rate": 2.970542246376118e-05, + "loss": 0.813398, + "num_input_tokens_seen": 
384472176, + "router_z_loss_mlp": 0.06347656, + "step": 4642, + "time_per_iteration": 4.66918683052063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067055, + "balance_loss_mlp": 1.0525589, + "epoch": 0.8932281646787226, + "flos": 611320255488.0, + "grad_norm": 0.08930531344509365, + "language_loss": 0.806961, + "learning_rate": 2.95997305629786e-05, + "loss": 0.8176316, + "num_input_tokens_seen": 384544224, + "router_z_loss_mlp": 0.14489746, + "step": 4643, + "time_per_iteration": 2.7685227394104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069165, + "balance_loss_mlp": 1.05447841, + "epoch": 0.8934205463639862, + "flos": 565760775168.0, + "grad_norm": 0.07686935082063327, + "language_loss": 0.84716517, + "learning_rate": 2.9494221287776957e-05, + "loss": 0.85785675, + "num_input_tokens_seen": 384611728, + "router_z_loss_mlp": 0.14660645, + "step": 4644, + "time_per_iteration": 2.671163320541382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071165, + "balance_loss_mlp": 1.05693138, + "epoch": 0.8936129280492497, + "flos": 488431420416.0, + "grad_norm": 0.07237412632406065, + "language_loss": 0.77936012, + "learning_rate": 2.9388894679118484e-05, + "loss": 0.79007179, + "num_input_tokens_seen": 384678048, + "router_z_loss_mlp": 0.14245605, + "step": 4645, + "time_per_iteration": 2.603137731552124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068007, + "balance_loss_mlp": 1.05332017, + "epoch": 0.8938053097345132, + "flos": 886490542080.0, + "grad_norm": 0.07522548933952772, + "language_loss": 0.80461609, + "learning_rate": 2.9283750777894912e-05, + "loss": 0.81529617, + "num_input_tokens_seen": 384766768, + "router_z_loss_mlp": 0.14672852, + "step": 4646, + "time_per_iteration": 3.280094623565674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066064, + "balance_loss_mlp": 1.05165184, + "epoch": 0.8939976914197768, + "flos": 593285658624.0, + "grad_norm": 
0.08219366473251573, + "language_loss": 0.83988786, + "learning_rate": 2.9178789624926427e-05, + "loss": 0.85054851, + "num_input_tokens_seen": 384842352, + "router_z_loss_mlp": 0.14404297, + "step": 4647, + "time_per_iteration": 2.7825815677642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065827, + "balance_loss_mlp": 1.05093789, + "epoch": 0.8941900731050404, + "flos": 523247745024.0, + "grad_norm": 0.07496651199822538, + "language_loss": 0.81140471, + "learning_rate": 2.9074011260962706e-05, + "loss": 0.82206297, + "num_input_tokens_seen": 384912048, + "router_z_loss_mlp": 0.14855957, + "step": 4648, + "time_per_iteration": 2.63409686088562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063595, + "balance_loss_mlp": 1.04921877, + "epoch": 0.894382454790304, + "flos": 800582745600.0, + "grad_norm": 0.06368180077002604, + "language_loss": 0.81087399, + "learning_rate": 2.8969415726682158e-05, + "loss": 0.82150996, + "num_input_tokens_seen": 384986560, + "router_z_loss_mlp": 0.14367676, + "step": 4649, + "time_per_iteration": 3.0075058937072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065601, + "balance_loss_mlp": 1.05089021, + "epoch": 0.8945748364755676, + "flos": 479037307392.0, + "grad_norm": 0.075834985070335, + "language_loss": 0.84640402, + "learning_rate": 2.8865003062692517e-05, + "loss": 0.85705996, + "num_input_tokens_seen": 385057376, + "router_z_loss_mlp": 0.14697266, + "step": 4650, + "time_per_iteration": 2.6034011840820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106995, + "balance_loss_mlp": 1.0558238, + "epoch": 0.894767218160831, + "flos": 508776090624.0, + "grad_norm": 0.07759752501155412, + "language_loss": 0.83263576, + "learning_rate": 2.876077330953042e-05, + "loss": 0.84333521, + "num_input_tokens_seen": 385130880, + "router_z_loss_mlp": 0.14123535, + "step": 4651, + "time_per_iteration": 2.705700635910034 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01062492, + "balance_loss_mlp": 1.04778171, + "epoch": 0.8949595998460946, + "flos": 685857549312.0, + "grad_norm": 0.07437002469966474, + "language_loss": 0.81665766, + "learning_rate": 2.8656726507661378e-05, + "loss": 0.82728255, + "num_input_tokens_seen": 385205808, + "router_z_loss_mlp": 0.14685059, + "step": 4652, + "time_per_iteration": 2.8570845127105713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063896, + "balance_loss_mlp": 1.04962647, + "epoch": 0.8951519815313582, + "flos": 799920520704.0, + "grad_norm": 0.0793410947413692, + "language_loss": 0.7713061, + "learning_rate": 2.855286269747981e-05, + "loss": 0.78194505, + "num_input_tokens_seen": 385283616, + "router_z_loss_mlp": 0.1427002, + "step": 4653, + "time_per_iteration": 3.003610610961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066344, + "balance_loss_mlp": 1.05177677, + "epoch": 0.8953443632166218, + "flos": 666740782080.0, + "grad_norm": 0.07417541500782357, + "language_loss": 0.86446422, + "learning_rate": 2.8449181919309398e-05, + "loss": 0.87512767, + "num_input_tokens_seen": 385357488, + "router_z_loss_mlp": 0.14550781, + "step": 4654, + "time_per_iteration": 2.797839403152466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074989, + "balance_loss_mlp": 1.0601716, + "epoch": 0.8955367449018854, + "flos": 644977686528.0, + "grad_norm": 0.07248317296811377, + "language_loss": 0.83017957, + "learning_rate": 2.8345684213402556e-05, + "loss": 0.84092951, + "num_input_tokens_seen": 385431280, + "router_z_loss_mlp": 0.14819336, + "step": 4655, + "time_per_iteration": 2.8501458168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069, + "balance_loss_mlp": 1.05457556, + "epoch": 0.8957291265871489, + "flos": 808714077696.0, + "grad_norm": 0.08493016666071657, + "language_loss": 0.77622211, + "learning_rate": 2.8242369619940644e-05, + "loss": 0.78691208, + "num_input_tokens_seen": 
385509840, + "router_z_loss_mlp": 0.14416504, + "step": 4656, + "time_per_iteration": 3.0544345378875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072691, + "balance_loss_mlp": 1.05780149, + "epoch": 0.8959215082724125, + "flos": 518923832832.0, + "grad_norm": 0.06895909454077155, + "language_loss": 0.77396119, + "learning_rate": 2.813923817903391e-05, + "loss": 0.78468812, + "num_input_tokens_seen": 385580384, + "router_z_loss_mlp": 0.14880371, + "step": 4657, + "time_per_iteration": 2.625452995300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062071, + "balance_loss_mlp": 1.04775393, + "epoch": 0.896113889957676, + "flos": 476917383168.0, + "grad_norm": 0.0682470588082175, + "language_loss": 0.770661, + "learning_rate": 2.8036289930721603e-05, + "loss": 0.78128171, + "num_input_tokens_seen": 385649184, + "router_z_loss_mlp": 0.14318848, + "step": 4658, + "time_per_iteration": 2.644498348236084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068233, + "balance_loss_mlp": 1.05376089, + "epoch": 0.8963062716429396, + "flos": 518162863104.0, + "grad_norm": 0.08115976152175236, + "language_loss": 0.83185726, + "learning_rate": 2.7933524914971697e-05, + "loss": 0.84253961, + "num_input_tokens_seen": 385717072, + "router_z_loss_mlp": 0.14453125, + "step": 4659, + "time_per_iteration": 2.6564247608184814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072802, + "balance_loss_mlp": 1.05815101, + "epoch": 0.8964986533282031, + "flos": 508484625408.0, + "grad_norm": 0.07390456564378446, + "language_loss": 0.81757265, + "learning_rate": 2.7830943171681113e-05, + "loss": 0.82830071, + "num_input_tokens_seen": 385788880, + "router_z_loss_mlp": 0.1463623, + "step": 4660, + "time_per_iteration": 2.782780170440674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068951, + "balance_loss_mlp": 1.05434823, + "epoch": 0.8966910350134667, + "flos": 536076320256.0, + "grad_norm": 
0.08706141368322202, + "language_loss": 0.81532872, + "learning_rate": 2.77285447406756e-05, + "loss": 0.82601821, + "num_input_tokens_seen": 385854240, + "router_z_loss_mlp": 0.14575195, + "step": 4661, + "time_per_iteration": 2.6224963665008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069438, + "balance_loss_mlp": 1.05518055, + "epoch": 0.8968834166987303, + "flos": 723226226688.0, + "grad_norm": 0.08580676839131798, + "language_loss": 0.84018874, + "learning_rate": 2.7626329661709914e-05, + "loss": 0.85088313, + "num_input_tokens_seen": 385926080, + "router_z_loss_mlp": 0.14257812, + "step": 4662, + "time_per_iteration": 2.8570454120635986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064868, + "balance_loss_mlp": 1.05049145, + "epoch": 0.8970757983839939, + "flos": 681686710272.0, + "grad_norm": 0.060610777722161696, + "language_loss": 0.83853519, + "learning_rate": 2.7524297974467372e-05, + "loss": 0.84918392, + "num_input_tokens_seen": 386005696, + "router_z_loss_mlp": 0.14355469, + "step": 4663, + "time_per_iteration": 2.920295476913452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063037, + "balance_loss_mlp": 1.04879189, + "epoch": 0.8972681800692575, + "flos": 613037486592.0, + "grad_norm": 0.0958798817779249, + "language_loss": 0.75815916, + "learning_rate": 2.742244971856006e-05, + "loss": 0.76878953, + "num_input_tokens_seen": 386073248, + "router_z_loss_mlp": 0.14257812, + "step": 4664, + "time_per_iteration": 2.734210729598999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067954, + "balance_loss_mlp": 1.05330276, + "epoch": 0.8974605617545209, + "flos": 572350344192.0, + "grad_norm": 0.06671630695450127, + "language_loss": 0.83153635, + "learning_rate": 2.732078493352913e-05, + "loss": 0.8422159, + "num_input_tokens_seen": 386148528, + "router_z_loss_mlp": 0.14624023, + "step": 4665, + "time_per_iteration": 2.785287857055664 + }, + { + "auxiliary_loss_clip": 0.0, 
+ "auxiliary_loss_mlp": 0.01067553, + "balance_loss_mlp": 1.05297387, + "epoch": 0.8976529434397845, + "flos": 520418608128.0, + "grad_norm": 0.07070738128757356, + "language_loss": 0.87607473, + "learning_rate": 2.721930365884434e-05, + "loss": 0.88675022, + "num_input_tokens_seen": 386218528, + "router_z_loss_mlp": 0.14575195, + "step": 4666, + "time_per_iteration": 2.703206777572632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069921, + "balance_loss_mlp": 1.0556159, + "epoch": 0.8978453251250481, + "flos": 471355656192.0, + "grad_norm": 0.06387672744087973, + "language_loss": 0.82552743, + "learning_rate": 2.7118005933904176e-05, + "loss": 0.83622664, + "num_input_tokens_seen": 386284704, + "router_z_loss_mlp": 0.14282227, + "step": 4667, + "time_per_iteration": 2.6351258754730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065876, + "balance_loss_mlp": 1.05146372, + "epoch": 0.8980377068103117, + "flos": 591659831808.0, + "grad_norm": 0.05668942470772938, + "language_loss": 0.81973124, + "learning_rate": 2.7016891798035904e-05, + "loss": 0.83038998, + "num_input_tokens_seen": 386356128, + "router_z_loss_mlp": 0.14416504, + "step": 4668, + "time_per_iteration": 2.787976026535034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069796, + "balance_loss_mlp": 1.05552649, + "epoch": 0.8982300884955752, + "flos": 767619472896.0, + "grad_norm": 0.08326371614499664, + "language_loss": 0.82582569, + "learning_rate": 2.691596129049556e-05, + "loss": 0.83652365, + "num_input_tokens_seen": 386434048, + "router_z_loss_mlp": 0.1427002, + "step": 4669, + "time_per_iteration": 2.969316244125366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067174, + "balance_loss_mlp": 1.05295277, + "epoch": 0.8984224701808388, + "flos": 844575496704.0, + "grad_norm": 0.06732596240846979, + "language_loss": 0.77453232, + "learning_rate": 2.681521445046775e-05, + "loss": 0.78520411, + "num_input_tokens_seen": 
386532384, + "router_z_loss_mlp": 0.14208984, + "step": 4670, + "time_per_iteration": 3.223684549331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065272, + "balance_loss_mlp": 1.0510509, + "epoch": 0.8986148518661023, + "flos": 757661879808.0, + "grad_norm": 0.07787101362548385, + "language_loss": 0.75908744, + "learning_rate": 2.6714651317065963e-05, + "loss": 0.76974022, + "num_input_tokens_seen": 386627120, + "router_z_loss_mlp": 0.14221191, + "step": 4671, + "time_per_iteration": 3.1525230407714844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063641, + "balance_loss_mlp": 1.04913378, + "epoch": 0.8988072335513659, + "flos": 563070030336.0, + "grad_norm": 0.07908163897949186, + "language_loss": 0.76794827, + "learning_rate": 2.6614271929332133e-05, + "loss": 0.77858472, + "num_input_tokens_seen": 386700192, + "router_z_loss_mlp": 0.14489746, + "step": 4672, + "time_per_iteration": 2.696798801422119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069201, + "balance_loss_mlp": 1.05497932, + "epoch": 0.8989996152366295, + "flos": 492683751936.0, + "grad_norm": 0.0696282834458535, + "language_loss": 0.86689526, + "learning_rate": 2.6514076326237147e-05, + "loss": 0.87758726, + "num_input_tokens_seen": 386764256, + "router_z_loss_mlp": 0.14208984, + "step": 4673, + "time_per_iteration": 2.608199119567871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069189, + "balance_loss_mlp": 1.05446625, + "epoch": 0.899191996921893, + "flos": 542567144448.0, + "grad_norm": 0.0815899420947763, + "language_loss": 0.75713086, + "learning_rate": 2.6414064546680438e-05, + "loss": 0.7678228, + "num_input_tokens_seen": 386835792, + "router_z_loss_mlp": 0.14697266, + "step": 4674, + "time_per_iteration": 2.6534104347229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063878, + "balance_loss_mlp": 1.04948986, + "epoch": 0.8993843786071566, + "flos": 471325920768.0, + "grad_norm": 
0.09288990193845198, + "language_loss": 0.79658222, + "learning_rate": 2.631423662948984e-05, + "loss": 0.80722106, + "num_input_tokens_seen": 386904368, + "router_z_loss_mlp": 0.1439209, + "step": 4675, + "time_per_iteration": 2.5522913932800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066799, + "balance_loss_mlp": 1.05254185, + "epoch": 0.8995767602924202, + "flos": 526726623744.0, + "grad_norm": 0.07376998741861143, + "language_loss": 0.82278091, + "learning_rate": 2.621459261342196e-05, + "loss": 0.83344889, + "num_input_tokens_seen": 386977872, + "router_z_loss_mlp": 0.14245605, + "step": 4676, + "time_per_iteration": 2.744189739227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067895, + "balance_loss_mlp": 1.05380487, + "epoch": 0.8997691419776838, + "flos": 557634212352.0, + "grad_norm": 0.07732253278752255, + "language_loss": 0.84530777, + "learning_rate": 2.6115132537162245e-05, + "loss": 0.85598671, + "num_input_tokens_seen": 387052080, + "router_z_loss_mlp": 0.14099121, + "step": 4677, + "time_per_iteration": 2.687650203704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068556, + "balance_loss_mlp": 1.05438173, + "epoch": 0.8999615236629472, + "flos": 639027947520.0, + "grad_norm": 0.07249769601440123, + "language_loss": 0.80352259, + "learning_rate": 2.601585643932436e-05, + "loss": 0.81420815, + "num_input_tokens_seen": 387129712, + "router_z_loss_mlp": 0.1418457, + "step": 4678, + "time_per_iteration": 2.8250062465667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010174, + "balance_loss_mlp": 1.00383258, + "epoch": 0.9001539053482108, + "flos": 1431510547968.0, + "grad_norm": 0.0048517691519101465, + "language_loss": 0.85784018, + "learning_rate": 2.5916764358450862e-05, + "loss": 0.86794198, + "num_input_tokens_seen": 387356560, + "router_z_loss_mlp": 0.06347656, + "step": 4679, + "time_per_iteration": 4.8175883293151855 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01069083, + "balance_loss_mlp": 1.05459857, + "epoch": 0.9003462870334744, + "flos": 566877450240.0, + "grad_norm": 0.07910377737039828, + "language_loss": 0.79715955, + "learning_rate": 2.5817856333012425e-05, + "loss": 0.80785036, + "num_input_tokens_seen": 387438640, + "router_z_loss_mlp": 0.14501953, + "step": 4680, + "time_per_iteration": 2.88101863861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066488, + "balance_loss_mlp": 1.05164611, + "epoch": 0.900538668718738, + "flos": 538655837184.0, + "grad_norm": 0.09077574273759434, + "language_loss": 0.784284, + "learning_rate": 2.5719132401408883e-05, + "loss": 0.79494882, + "num_input_tokens_seen": 387507088, + "router_z_loss_mlp": 0.14819336, + "step": 4681, + "time_per_iteration": 2.6530046463012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065401, + "balance_loss_mlp": 1.05086935, + "epoch": 0.9007310504040016, + "flos": 488387003904.0, + "grad_norm": 0.09481128495773143, + "language_loss": 0.85749257, + "learning_rate": 2.5620592601968028e-05, + "loss": 0.86814654, + "num_input_tokens_seen": 387574160, + "router_z_loss_mlp": 0.14526367, + "step": 4682, + "time_per_iteration": 2.5924911499023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069464, + "balance_loss_mlp": 1.05537307, + "epoch": 0.9009234320892651, + "flos": 652901617152.0, + "grad_norm": 0.07242356929765614, + "language_loss": 0.78751016, + "learning_rate": 2.5522236972946532e-05, + "loss": 0.79820478, + "num_input_tokens_seen": 387652528, + "router_z_loss_mlp": 0.14086914, + "step": 4683, + "time_per_iteration": 2.8851335048675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069143, + "balance_loss_mlp": 1.05439687, + "epoch": 0.9011158137745287, + "flos": 545569178112.0, + "grad_norm": 0.05948760024031309, + "language_loss": 0.85193956, + "learning_rate": 2.5424065552529295e-05, + "loss": 0.86263096, + 
"num_input_tokens_seen": 387723520, + "router_z_loss_mlp": 0.1472168, + "step": 4684, + "time_per_iteration": 2.6724307537078857 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060384, + "balance_loss_mlp": 1.04620993, + "epoch": 0.9013081954597922, + "flos": 559699808256.0, + "grad_norm": 0.06866797250942824, + "language_loss": 0.82781953, + "learning_rate": 2.532607837883011e-05, + "loss": 0.83842337, + "num_input_tokens_seen": 387793664, + "router_z_loss_mlp": 0.14160156, + "step": 4685, + "time_per_iteration": 2.739715099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065935, + "balance_loss_mlp": 1.05133152, + "epoch": 0.9015005771450558, + "flos": 728652132864.0, + "grad_norm": 0.06366441246454302, + "language_loss": 0.81115925, + "learning_rate": 2.5228275489890706e-05, + "loss": 0.82181865, + "num_input_tokens_seen": 387871008, + "router_z_loss_mlp": 0.14599609, + "step": 4686, + "time_per_iteration": 2.9184703826904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065827, + "balance_loss_mlp": 1.05153358, + "epoch": 0.9016929588303193, + "flos": 517416574464.0, + "grad_norm": 0.07207113969320707, + "language_loss": 0.81100535, + "learning_rate": 2.5130656923681605e-05, + "loss": 0.82166362, + "num_input_tokens_seen": 387950832, + "router_z_loss_mlp": 0.14294434, + "step": 4687, + "time_per_iteration": 2.788123369216919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065738, + "balance_loss_mlp": 1.05131364, + "epoch": 0.9018853405155829, + "flos": 622335052800.0, + "grad_norm": 0.08582414251265837, + "language_loss": 0.85935158, + "learning_rate": 2.503322271810171e-05, + "loss": 0.87000895, + "num_input_tokens_seen": 388029792, + "router_z_loss_mlp": 0.14416504, + "step": 4688, + "time_per_iteration": 2.828031301498413 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065081, + "balance_loss_mlp": 1.05029953, + "epoch": 0.9020777222008465, + "flos": 
523284820992.0, + "grad_norm": 0.07001634652504764, + "language_loss": 0.77557302, + "learning_rate": 2.4935972910978378e-05, + "loss": 0.78622389, + "num_input_tokens_seen": 388095872, + "router_z_loss_mlp": 0.14758301, + "step": 4689, + "time_per_iteration": 2.628683567047119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061017, + "balance_loss_mlp": 1.04687846, + "epoch": 0.9022701038861101, + "flos": 633713269248.0, + "grad_norm": 0.058017175212302395, + "language_loss": 0.81781268, + "learning_rate": 2.4838907540067346e-05, + "loss": 0.82842284, + "num_input_tokens_seen": 388171632, + "router_z_loss_mlp": 0.14135742, + "step": 4690, + "time_per_iteration": 2.9031572341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067164, + "balance_loss_mlp": 1.05271626, + "epoch": 0.9024624855713737, + "flos": 513295294464.0, + "grad_norm": 0.08913945678563855, + "language_loss": 0.84359717, + "learning_rate": 2.474202664305253e-05, + "loss": 0.85426879, + "num_input_tokens_seen": 388242240, + "router_z_loss_mlp": 0.14428711, + "step": 4691, + "time_per_iteration": 2.631625175476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065723, + "balance_loss_mlp": 1.05104828, + "epoch": 0.9026548672566371, + "flos": 477411480576.0, + "grad_norm": 0.07129062620509946, + "language_loss": 0.86470556, + "learning_rate": 2.464533025754673e-05, + "loss": 0.87536281, + "num_input_tokens_seen": 388310960, + "router_z_loss_mlp": 0.14660645, + "step": 4692, + "time_per_iteration": 2.6019630432128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106471, + "balance_loss_mlp": 1.05016685, + "epoch": 0.9028472489419007, + "flos": 661994353152.0, + "grad_norm": 0.06751433546030572, + "language_loss": 0.73846859, + "learning_rate": 2.454881842109058e-05, + "loss": 0.74911571, + "num_input_tokens_seen": 388387280, + "router_z_loss_mlp": 0.14526367, + "step": 4693, + "time_per_iteration": 2.8628439903259277 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070243, + "balance_loss_mlp": 1.05603313, + "epoch": 0.9030396306271643, + "flos": 534588885504.0, + "grad_norm": 0.08037251387372839, + "language_loss": 0.8173911, + "learning_rate": 2.4452491171153445e-05, + "loss": 0.82809353, + "num_input_tokens_seen": 388456992, + "router_z_loss_mlp": 0.14208984, + "step": 4694, + "time_per_iteration": 2.6520001888275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071097, + "balance_loss_mlp": 1.05644655, + "epoch": 0.9032320123124279, + "flos": 801032426496.0, + "grad_norm": 0.06679419876391568, + "language_loss": 0.82209843, + "learning_rate": 2.43563485451328e-05, + "loss": 0.83280945, + "num_input_tokens_seen": 388534896, + "router_z_loss_mlp": 0.1463623, + "step": 4695, + "time_per_iteration": 2.9608535766601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064857, + "balance_loss_mlp": 1.05057585, + "epoch": 0.9034243939976914, + "flos": 553942789632.0, + "grad_norm": 0.07597748823489225, + "language_loss": 0.76713479, + "learning_rate": 2.426039058035451e-05, + "loss": 0.77778327, + "num_input_tokens_seen": 388606640, + "router_z_loss_mlp": 0.14294434, + "step": 4696, + "time_per_iteration": 2.6596148014068604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065816, + "balance_loss_mlp": 1.05115342, + "epoch": 0.903616775682955, + "flos": 503903752704.0, + "grad_norm": 0.08113286673515856, + "language_loss": 0.82694674, + "learning_rate": 2.4164617314072823e-05, + "loss": 0.83760482, + "num_input_tokens_seen": 388675920, + "router_z_loss_mlp": 0.1463623, + "step": 4697, + "time_per_iteration": 2.583207845687866 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066346, + "balance_loss_mlp": 1.05205238, + "epoch": 0.9038091573682185, + "flos": 436297052160.0, + "grad_norm": 0.06372833149732707, + "language_loss": 0.78702718, + "learning_rate": 2.406902878347017e-05, + "loss": 
0.79769063, + "num_input_tokens_seen": 388743968, + "router_z_loss_mlp": 0.14294434, + "step": 4698, + "time_per_iteration": 2.6969242095947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068087, + "balance_loss_mlp": 1.0532459, + "epoch": 0.9040015390534821, + "flos": 532916070912.0, + "grad_norm": 0.10371766305830984, + "language_loss": 0.81603229, + "learning_rate": 2.3973625025657253e-05, + "loss": 0.8267132, + "num_input_tokens_seen": 388810784, + "router_z_loss_mlp": 0.14807129, + "step": 4699, + "time_per_iteration": 2.715841054916382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064575, + "balance_loss_mlp": 1.05056787, + "epoch": 0.9041939207387457, + "flos": 564307845120.0, + "grad_norm": 0.07382898909247483, + "language_loss": 0.80341852, + "learning_rate": 2.3878406077673275e-05, + "loss": 0.81406426, + "num_input_tokens_seen": 388885072, + "router_z_loss_mlp": 0.14025879, + "step": 4700, + "time_per_iteration": 2.777735948562622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066468, + "balance_loss_mlp": 1.05236554, + "epoch": 0.9043863024240092, + "flos": 515509194240.0, + "grad_norm": 0.08219031105158194, + "language_loss": 0.77413332, + "learning_rate": 2.3783371976485447e-05, + "loss": 0.78479803, + "num_input_tokens_seen": 388951184, + "router_z_loss_mlp": 0.14099121, + "step": 4701, + "time_per_iteration": 2.5898244380950928 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01006479, + "balance_loss_mlp": 1.00013745, + "epoch": 0.9045786841092728, + "flos": 1277949063168.0, + "grad_norm": 0.004346431205568835, + "language_loss": 0.72929788, + "learning_rate": 2.368852275898914e-05, + "loss": 0.73936266, + "num_input_tokens_seen": 389170752, + "router_z_loss_mlp": 0.06347656, + "step": 4702, + "time_per_iteration": 4.976499557495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069366, + "balance_loss_mlp": 1.05514455, + "epoch": 0.9047710657945364, 
+ "flos": 585841144320.0, + "grad_norm": 0.09422512438722834, + "language_loss": 0.82765079, + "learning_rate": 2.3593858462008178e-05, + "loss": 0.83834445, + "num_input_tokens_seen": 389239600, + "router_z_loss_mlp": 0.14221191, + "step": 4703, + "time_per_iteration": 2.694584369659424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059188, + "balance_loss_mlp": 1.04433465, + "epoch": 0.9049634474798, + "flos": 571937739264.0, + "grad_norm": 0.07632132542953685, + "language_loss": 0.79691899, + "learning_rate": 2.3499379122294495e-05, + "loss": 0.80751085, + "num_input_tokens_seen": 389316032, + "router_z_loss_mlp": 0.14831543, + "step": 4704, + "time_per_iteration": 2.7374324798583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065528, + "balance_loss_mlp": 1.05164027, + "epoch": 0.9051558291650635, + "flos": 572619787776.0, + "grad_norm": 0.09869454063835598, + "language_loss": 0.74012047, + "learning_rate": 2.3405084776528307e-05, + "loss": 0.75077575, + "num_input_tokens_seen": 389383504, + "router_z_loss_mlp": 0.13916016, + "step": 4705, + "time_per_iteration": 2.6923489570617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061143, + "balance_loss_mlp": 1.04684973, + "epoch": 0.905348210850327, + "flos": 540538624512.0, + "grad_norm": 0.07680020480338727, + "language_loss": 0.7908138, + "learning_rate": 2.331097546131783e-05, + "loss": 0.80142522, + "num_input_tokens_seen": 389454592, + "router_z_loss_mlp": 0.14282227, + "step": 4706, + "time_per_iteration": 2.7089977264404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064406, + "balance_loss_mlp": 1.04993391, + "epoch": 0.9055405925355906, + "flos": 516381391872.0, + "grad_norm": 0.06962007853797926, + "language_loss": 0.81341648, + "learning_rate": 2.321705121319956e-05, + "loss": 0.82406056, + "num_input_tokens_seen": 389519696, + "router_z_loss_mlp": 0.14453125, + "step": 4707, + "time_per_iteration": 
2.5984983444213867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060615, + "balance_loss_mlp": 1.04610777, + "epoch": 0.9057329742208542, + "flos": 914643145728.0, + "grad_norm": 0.07927977683650914, + "language_loss": 0.84686792, + "learning_rate": 2.3123312068638104e-05, + "loss": 0.85747409, + "num_input_tokens_seen": 389603568, + "router_z_loss_mlp": 0.14489746, + "step": 4708, + "time_per_iteration": 3.2090601921081543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061343, + "balance_loss_mlp": 1.04700196, + "epoch": 0.9059253559061178, + "flos": 905261515776.0, + "grad_norm": 0.07156146868519836, + "language_loss": 0.82681596, + "learning_rate": 2.3029758064026295e-05, + "loss": 0.8374294, + "num_input_tokens_seen": 389687504, + "router_z_loss_mlp": 0.14343262, + "step": 4709, + "time_per_iteration": 3.144296884536743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106027, + "balance_loss_mlp": 1.0454762, + "epoch": 0.9061177375913813, + "flos": 664534222848.0, + "grad_norm": 0.08456755450769048, + "language_loss": 0.77485931, + "learning_rate": 2.2936389235684918e-05, + "loss": 0.78546202, + "num_input_tokens_seen": 389764880, + "router_z_loss_mlp": 0.14770508, + "step": 4710, + "time_per_iteration": 2.905553102493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064119, + "balance_loss_mlp": 1.04987335, + "epoch": 0.9063101192766448, + "flos": 565609900032.0, + "grad_norm": 0.0740076583450613, + "language_loss": 0.82648456, + "learning_rate": 2.2843205619862972e-05, + "loss": 0.83712578, + "num_input_tokens_seen": 389838304, + "router_z_loss_mlp": 0.14233398, + "step": 4711, + "time_per_iteration": 2.807823419570923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065313, + "balance_loss_mlp": 1.05088818, + "epoch": 0.9065025009619084, + "flos": 727377242112.0, + "grad_norm": 0.07095500667918284, + "language_loss": 0.79161823, + "learning_rate": 
2.2750207252737742e-05, + "loss": 0.80227137, + "num_input_tokens_seen": 389908592, + "router_z_loss_mlp": 0.14404297, + "step": 4712, + "time_per_iteration": 2.9008591175079346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059623, + "balance_loss_mlp": 1.0455091, + "epoch": 0.906694882647172, + "flos": 531512699904.0, + "grad_norm": 0.07208403118475668, + "language_loss": 0.79940331, + "learning_rate": 2.265739417041418e-05, + "loss": 0.80999959, + "num_input_tokens_seen": 389979040, + "router_z_loss_mlp": 0.14111328, + "step": 4713, + "time_per_iteration": 2.630858898162842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063209, + "balance_loss_mlp": 1.04830837, + "epoch": 0.9068872643324356, + "flos": 429788975616.0, + "grad_norm": 0.08721632216250842, + "language_loss": 0.846187, + "learning_rate": 2.2564766408925574e-05, + "loss": 0.85681909, + "num_input_tokens_seen": 390046080, + "router_z_loss_mlp": 0.14892578, + "step": 4714, + "time_per_iteration": 2.588636636734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069052, + "balance_loss_mlp": 1.05467498, + "epoch": 0.9070796460176991, + "flos": 588366332928.0, + "grad_norm": 0.07267525765549768, + "language_loss": 0.79764116, + "learning_rate": 2.2472324004233214e-05, + "loss": 0.80833167, + "num_input_tokens_seen": 390122176, + "router_z_loss_mlp": 0.14367676, + "step": 4715, + "time_per_iteration": 2.753297805786133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065162, + "balance_loss_mlp": 1.05052352, + "epoch": 0.9072720277029627, + "flos": 571582033920.0, + "grad_norm": 0.13233768252009984, + "language_loss": 0.75550985, + "learning_rate": 2.2380066992226446e-05, + "loss": 0.76616144, + "num_input_tokens_seen": 390195216, + "router_z_loss_mlp": 0.14611816, + "step": 4716, + "time_per_iteration": 2.7039754390716553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062213, + "balance_loss_mlp": 1.04836059, 
+ "epoch": 0.9074644093882263, + "flos": 555798412800.0, + "grad_norm": 0.0678711634408688, + "language_loss": 0.88607067, + "learning_rate": 2.2287995408722617e-05, + "loss": 0.89669281, + "num_input_tokens_seen": 390263216, + "router_z_loss_mlp": 0.13867188, + "step": 4717, + "time_per_iteration": 2.653090476989746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106287, + "balance_loss_mlp": 1.04805207, + "epoch": 0.9076567910734898, + "flos": 640994798592.0, + "grad_norm": 0.0708629582790745, + "language_loss": 0.82607627, + "learning_rate": 2.2196109289467083e-05, + "loss": 0.83670497, + "num_input_tokens_seen": 390337360, + "router_z_loss_mlp": 0.14794922, + "step": 4718, + "time_per_iteration": 2.7904906272888184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067779, + "balance_loss_mlp": 1.05324757, + "epoch": 0.9078491727587533, + "flos": 733998744576.0, + "grad_norm": 0.06597864576502437, + "language_loss": 0.8173753, + "learning_rate": 2.2104408670133193e-05, + "loss": 0.82805312, + "num_input_tokens_seen": 390427728, + "router_z_loss_mlp": 0.1451416, + "step": 4719, + "time_per_iteration": 3.107698678970337 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062712, + "balance_loss_mlp": 1.04819226, + "epoch": 0.9080415544440169, + "flos": 654774492672.0, + "grad_norm": 0.06124132734061613, + "language_loss": 0.86642504, + "learning_rate": 2.2012893586322245e-05, + "loss": 0.87705219, + "num_input_tokens_seen": 390504736, + "router_z_loss_mlp": 0.14489746, + "step": 4720, + "time_per_iteration": 2.853066921234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061154, + "balance_loss_mlp": 1.0462532, + "epoch": 0.9082339361292805, + "flos": 597463838208.0, + "grad_norm": 0.06958086919390859, + "language_loss": 0.79430765, + "learning_rate": 2.1921564073563604e-05, + "loss": 0.80491918, + "num_input_tokens_seen": 390582048, + "router_z_loss_mlp": 0.14892578, + "step": 4721, + 
"time_per_iteration": 2.7562382221221924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064575, + "balance_loss_mlp": 1.05006683, + "epoch": 0.9084263178145441, + "flos": 504407761920.0, + "grad_norm": 0.07783183488641256, + "language_loss": 0.84554178, + "learning_rate": 2.183042016731457e-05, + "loss": 0.85618752, + "num_input_tokens_seen": 390652976, + "router_z_loss_mlp": 0.14489746, + "step": 4722, + "time_per_iteration": 2.6053574085235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063826, + "balance_loss_mlp": 1.04959297, + "epoch": 0.9086186994998077, + "flos": 550031482368.0, + "grad_norm": 0.0692661418085719, + "language_loss": 0.80308425, + "learning_rate": 2.1739461902960223e-05, + "loss": 0.81372249, + "num_input_tokens_seen": 390726832, + "router_z_loss_mlp": 0.14221191, + "step": 4723, + "time_per_iteration": 2.726078748703003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066428, + "balance_loss_mlp": 1.05238521, + "epoch": 0.9088110811850711, + "flos": 1134076847616.0, + "grad_norm": 0.07360851759863624, + "language_loss": 0.75023186, + "learning_rate": 2.1648689315813763e-05, + "loss": 0.76089615, + "num_input_tokens_seen": 390824480, + "router_z_loss_mlp": 0.14025879, + "step": 4724, + "time_per_iteration": 3.610614776611328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066842, + "balance_loss_mlp": 1.05257237, + "epoch": 0.9090034628703347, + "flos": 556991811072.0, + "grad_norm": 0.06736800073919637, + "language_loss": 0.76777983, + "learning_rate": 2.155810244111628e-05, + "loss": 0.77844834, + "num_input_tokens_seen": 390897552, + "router_z_loss_mlp": 0.1427002, + "step": 4725, + "time_per_iteration": 2.7194221019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066103, + "balance_loss_mlp": 1.05192947, + "epoch": 0.9091958445555983, + "flos": 543970515456.0, + "grad_norm": 0.06425275032512914, + "language_loss": 0.84323931, + 
"learning_rate": 2.146770131403658e-05, + "loss": 0.85390031, + "num_input_tokens_seen": 390969008, + "router_z_loss_mlp": 0.1418457, + "step": 4726, + "time_per_iteration": 2.70613169670105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068032, + "balance_loss_mlp": 1.0533216, + "epoch": 0.9093882262408619, + "flos": 526113957888.0, + "grad_norm": 0.0798223233721421, + "language_loss": 0.80948919, + "learning_rate": 2.1377485969671594e-05, + "loss": 0.82016957, + "num_input_tokens_seen": 391038880, + "router_z_loss_mlp": 0.14697266, + "step": 4727, + "time_per_iteration": 2.6258621215820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067441, + "balance_loss_mlp": 1.05299282, + "epoch": 0.9095806079261254, + "flos": 548526795264.0, + "grad_norm": 0.0886839073998551, + "language_loss": 0.819561, + "learning_rate": 2.1287456443046084e-05, + "loss": 0.83023536, + "num_input_tokens_seen": 391106720, + "router_z_loss_mlp": 0.14453125, + "step": 4728, + "time_per_iteration": 2.6196727752685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065639, + "balance_loss_mlp": 1.05156028, + "epoch": 0.909772989611389, + "flos": 572535724032.0, + "grad_norm": 0.0754217444502015, + "language_loss": 0.8483696, + "learning_rate": 2.1197612769112528e-05, + "loss": 0.85902596, + "num_input_tokens_seen": 391178128, + "router_z_loss_mlp": 0.14086914, + "step": 4729, + "time_per_iteration": 2.736675977706909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064015, + "balance_loss_mlp": 1.04972172, + "epoch": 0.9099653712966526, + "flos": 561812391936.0, + "grad_norm": 0.07023842602850215, + "language_loss": 0.79529822, + "learning_rate": 2.1107954982751254e-05, + "loss": 0.80593836, + "num_input_tokens_seen": 391248848, + "router_z_loss_mlp": 0.14294434, + "step": 4730, + "time_per_iteration": 2.682899236679077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062771, + "balance_loss_mlp": 
1.04809618, + "epoch": 0.9101577529819161, + "flos": 1093800112128.0, + "grad_norm": 0.08043552719518131, + "language_loss": 0.79924774, + "learning_rate": 2.101848311877069e-05, + "loss": 0.80987543, + "num_input_tokens_seen": 391328000, + "router_z_loss_mlp": 0.14672852, + "step": 4731, + "time_per_iteration": 3.3790597915649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065728, + "balance_loss_mlp": 1.05133891, + "epoch": 0.9103501346671797, + "flos": 445444116480.0, + "grad_norm": 0.11616845268316883, + "language_loss": 0.81709516, + "learning_rate": 2.092919721190678e-05, + "loss": 0.82775241, + "num_input_tokens_seen": 391391616, + "router_z_loss_mlp": 0.14367676, + "step": 4732, + "time_per_iteration": 2.5050580501556396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069513, + "balance_loss_mlp": 1.0552671, + "epoch": 0.9105425163524432, + "flos": 500770667520.0, + "grad_norm": 0.08258993648041235, + "language_loss": 0.77471602, + "learning_rate": 2.0840097296823346e-05, + "loss": 0.78541112, + "num_input_tokens_seen": 391461312, + "router_z_loss_mlp": 0.14257812, + "step": 4733, + "time_per_iteration": 2.619145393371582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061811, + "balance_loss_mlp": 1.04723191, + "epoch": 0.9107348980377068, + "flos": 657519565824.0, + "grad_norm": 0.06290926072647557, + "language_loss": 0.84182942, + "learning_rate": 2.0751183408112162e-05, + "loss": 0.85244751, + "num_input_tokens_seen": 391542192, + "router_z_loss_mlp": 0.14562988, + "step": 4734, + "time_per_iteration": 2.8620665073394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067407, + "balance_loss_mlp": 1.05304253, + "epoch": 0.9109272797229704, + "flos": 553668576768.0, + "grad_norm": 0.06328957974022432, + "language_loss": 0.85179257, + "learning_rate": 2.066245558029256e-05, + "loss": 0.86246669, + "num_input_tokens_seen": 391609968, + "router_z_loss_mlp": 0.14367676, + "step": 
4735, + "time_per_iteration": 2.6373870372772217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069052, + "balance_loss_mlp": 1.05477083, + "epoch": 0.911119661408234, + "flos": 519007896576.0, + "grad_norm": 0.08501781377109913, + "language_loss": 0.84289479, + "learning_rate": 2.057391384781182e-05, + "loss": 0.85358536, + "num_input_tokens_seen": 391681264, + "router_z_loss_mlp": 0.14282227, + "step": 4736, + "time_per_iteration": 2.6207847595214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066578, + "balance_loss_mlp": 1.05243933, + "epoch": 0.9113120430934974, + "flos": 554375218176.0, + "grad_norm": 0.07334243332410934, + "language_loss": 0.82774675, + "learning_rate": 2.0485558245044834e-05, + "loss": 0.83841252, + "num_input_tokens_seen": 391751392, + "router_z_loss_mlp": 0.14135742, + "step": 4737, + "time_per_iteration": 2.735215663909912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068558, + "balance_loss_mlp": 1.05442023, + "epoch": 0.911504424778761, + "flos": 501889913856.0, + "grad_norm": 0.08209276637430535, + "language_loss": 0.81173909, + "learning_rate": 2.0397388806294216e-05, + "loss": 0.82242465, + "num_input_tokens_seen": 391823952, + "router_z_loss_mlp": 0.14135742, + "step": 4738, + "time_per_iteration": 2.7108118534088135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065634, + "balance_loss_mlp": 1.05147171, + "epoch": 0.9116968064640246, + "flos": 611100370944.0, + "grad_norm": 0.06232841540702404, + "language_loss": 0.82146698, + "learning_rate": 2.0309405565790527e-05, + "loss": 0.83212328, + "num_input_tokens_seen": 391895264, + "router_z_loss_mlp": 0.14172363, + "step": 4739, + "time_per_iteration": 2.7313618659973145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065756, + "balance_loss_mlp": 1.0513792, + "epoch": 0.9118891881492882, + "flos": 572918593536.0, + "grad_norm": 0.0711794304309251, + "language_loss": 0.82400596, + 
"learning_rate": 2.0221608557691895e-05, + "loss": 0.83466357, + "num_input_tokens_seen": 391973040, + "router_z_loss_mlp": 0.14379883, + "step": 4740, + "time_per_iteration": 2.7886438369750977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068105, + "balance_loss_mlp": 1.05344248, + "epoch": 0.9120815698345518, + "flos": 635961673728.0, + "grad_norm": 0.07367219932429818, + "language_loss": 0.77975518, + "learning_rate": 2.0133997816083992e-05, + "loss": 0.79043615, + "num_input_tokens_seen": 392048160, + "router_z_loss_mlp": 0.14648438, + "step": 4741, + "time_per_iteration": 2.8221640586853027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064444, + "balance_loss_mlp": 1.04966211, + "epoch": 0.9122739515198153, + "flos": 702300824064.0, + "grad_norm": 0.09047893023933404, + "language_loss": 0.85824144, + "learning_rate": 2.0046573374980447e-05, + "loss": 0.86888587, + "num_input_tokens_seen": 392128960, + "router_z_loss_mlp": 0.14758301, + "step": 4742, + "time_per_iteration": 2.846888303756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066977, + "balance_loss_mlp": 1.0525769, + "epoch": 0.9124663332050789, + "flos": 524690763264.0, + "grad_norm": 0.08214101910930026, + "language_loss": 0.87773347, + "learning_rate": 1.995933526832239e-05, + "loss": 0.8884033, + "num_input_tokens_seen": 392195008, + "router_z_loss_mlp": 0.14404297, + "step": 4743, + "time_per_iteration": 2.61386775970459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106707, + "balance_loss_mlp": 1.05287266, + "epoch": 0.9126587148903424, + "flos": 563299826688.0, + "grad_norm": 0.07495266028674485, + "language_loss": 0.82313836, + "learning_rate": 1.9872283529978662e-05, + "loss": 0.83380902, + "num_input_tokens_seen": 392265168, + "router_z_loss_mlp": 0.14196777, + "step": 4744, + "time_per_iteration": 2.675384521484375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063023, + 
"balance_loss_mlp": 1.04872966, + "epoch": 0.912851096575606, + "flos": 505942184448.0, + "grad_norm": 0.08022890844288642, + "language_loss": 0.79648215, + "learning_rate": 1.978541819374574e-05, + "loss": 0.8071124, + "num_input_tokens_seen": 392329456, + "router_z_loss_mlp": 0.14294434, + "step": 4745, + "time_per_iteration": 2.599677562713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065707, + "balance_loss_mlp": 1.05115116, + "epoch": 0.9130434782608695, + "flos": 550730783232.0, + "grad_norm": 0.06424821919191918, + "language_loss": 0.82602614, + "learning_rate": 1.9698739293347755e-05, + "loss": 0.83668321, + "num_input_tokens_seen": 392397792, + "router_z_loss_mlp": 0.14550781, + "step": 4746, + "time_per_iteration": 2.6465249061584473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069413, + "balance_loss_mlp": 1.05548978, + "epoch": 0.9132358599461331, + "flos": 468976200192.0, + "grad_norm": 0.06465782208543523, + "language_loss": 0.83286655, + "learning_rate": 1.9612246862436456e-05, + "loss": 0.84356076, + "num_input_tokens_seen": 392462928, + "router_z_loss_mlp": 0.13952637, + "step": 4747, + "time_per_iteration": 2.554649829864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060732, + "balance_loss_mlp": 1.04640317, + "epoch": 0.9134282416313967, + "flos": 506097828864.0, + "grad_norm": 0.077833753798199, + "language_loss": 0.79933763, + "learning_rate": 1.9525940934591148e-05, + "loss": 0.80994493, + "num_input_tokens_seen": 392531840, + "router_z_loss_mlp": 0.14343262, + "step": 4748, + "time_per_iteration": 2.6685752868652344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067379, + "balance_loss_mlp": 1.05307388, + "epoch": 0.9136206233166603, + "flos": 604819519488.0, + "grad_norm": 0.07038608549494893, + "language_loss": 0.84113944, + "learning_rate": 1.9439821543318748e-05, + "loss": 0.8518132, + "num_input_tokens_seen": 392602464, + "router_z_loss_mlp": 
0.14294434, + "step": 4749, + "time_per_iteration": 2.7418196201324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067371, + "balance_loss_mlp": 1.05276763, + "epoch": 0.9138130050019239, + "flos": 561738240000.0, + "grad_norm": 0.07853084861256635, + "language_loss": 0.82891721, + "learning_rate": 1.9353888722053793e-05, + "loss": 0.83959091, + "num_input_tokens_seen": 392669872, + "router_z_loss_mlp": 0.14599609, + "step": 4750, + "time_per_iteration": 2.6632273197174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071287, + "balance_loss_mlp": 1.05674314, + "epoch": 0.9140053866871873, + "flos": 690117221376.0, + "grad_norm": 0.20385434890647738, + "language_loss": 0.9033643, + "learning_rate": 1.9268142504158426e-05, + "loss": 0.91407716, + "num_input_tokens_seen": 392744256, + "router_z_loss_mlp": 0.14526367, + "step": 4751, + "time_per_iteration": 2.8812520503997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063694, + "balance_loss_mlp": 1.0491389, + "epoch": 0.9141977683724509, + "flos": 551012336640.0, + "grad_norm": 0.06632997403906014, + "language_loss": 0.84121269, + "learning_rate": 1.9182582922922186e-05, + "loss": 0.85184962, + "num_input_tokens_seen": 392816832, + "router_z_loss_mlp": 0.14550781, + "step": 4752, + "time_per_iteration": 2.742851495742798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067674, + "balance_loss_mlp": 1.05316639, + "epoch": 0.9143901500577145, + "flos": 540088943616.0, + "grad_norm": 0.0723562787374101, + "language_loss": 0.7560128, + "learning_rate": 1.9097210011562228e-05, + "loss": 0.76668954, + "num_input_tokens_seen": 392886304, + "router_z_loss_mlp": 0.14489746, + "step": 4753, + "time_per_iteration": 2.6533844470977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066571, + "balance_loss_mlp": 1.05187273, + "epoch": 0.9145825317429781, + "flos": 528767626752.0, + "grad_norm": 0.2806334555286775, + 
"language_loss": 0.80675328, + "learning_rate": 1.9012023803223366e-05, + "loss": 0.81741893, + "num_input_tokens_seen": 392955872, + "router_z_loss_mlp": 0.14672852, + "step": 4754, + "time_per_iteration": 2.632373094558716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073062, + "balance_loss_mlp": 1.05884004, + "epoch": 0.9147749134282416, + "flos": 514792641024.0, + "grad_norm": 0.06108923695610088, + "language_loss": 0.7887972, + "learning_rate": 1.892702433097776e-05, + "loss": 0.79952776, + "num_input_tokens_seen": 393025776, + "router_z_loss_mlp": 0.14208984, + "step": 4755, + "time_per_iteration": 2.648470640182495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067557, + "balance_loss_mlp": 1.05308533, + "epoch": 0.9149672951135052, + "flos": 514441704960.0, + "grad_norm": 0.07002276071565354, + "language_loss": 0.85469049, + "learning_rate": 1.8842211627825233e-05, + "loss": 0.8653661, + "num_input_tokens_seen": 393095936, + "router_z_loss_mlp": 0.14453125, + "step": 4756, + "time_per_iteration": 2.65950345993042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065171, + "balance_loss_mlp": 1.0507822, + "epoch": 0.9151596767987688, + "flos": 577069608960.0, + "grad_norm": 0.06465328138822253, + "language_loss": 0.812971, + "learning_rate": 1.8757585726692727e-05, + "loss": 0.8236227, + "num_input_tokens_seen": 393166816, + "router_z_loss_mlp": 0.1439209, + "step": 4757, + "time_per_iteration": 2.747659683227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068049, + "balance_loss_mlp": 1.05413735, + "epoch": 0.9153520584840323, + "flos": 619335590400.0, + "grad_norm": 0.07075275740266723, + "language_loss": 0.82515383, + "learning_rate": 1.8673146660435182e-05, + "loss": 0.83583432, + "num_input_tokens_seen": 393242176, + "router_z_loss_mlp": 0.13928223, + "step": 4758, + "time_per_iteration": 2.752819538116455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01069271, + "balance_loss_mlp": 1.05499017, + "epoch": 0.9155444401692959, + "flos": 468921871872.0, + "grad_norm": 0.11778599796546448, + "language_loss": 0.82900631, + "learning_rate": 1.8588894461834704e-05, + "loss": 0.83969903, + "num_input_tokens_seen": 393311792, + "router_z_loss_mlp": 0.14294434, + "step": 4759, + "time_per_iteration": 2.6110918521881104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008192, + "balance_loss_mlp": 1.00194573, + "epoch": 0.9157368218545594, + "flos": 1410711054336.0, + "grad_norm": 0.006027089947377037, + "language_loss": 0.7481907, + "learning_rate": 1.8504829163600855e-05, + "loss": 0.75827265, + "num_input_tokens_seen": 393535648, + "router_z_loss_mlp": 0.0625, + "step": 4760, + "time_per_iteration": 4.842655420303345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008593, + "balance_loss_mlp": 1.00234604, + "epoch": 0.915929203539823, + "flos": 1522019040768.0, + "grad_norm": 0.006843317514305485, + "language_loss": 0.79576051, + "learning_rate": 1.8420950798370584e-05, + "loss": 0.80584645, + "num_input_tokens_seen": 393767040, + "router_z_loss_mlp": 0.0625, + "step": 4761, + "time_per_iteration": 4.96194052696228 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065486, + "balance_loss_mlp": 1.05128801, + "epoch": 0.9161215852250866, + "flos": 535752548352.0, + "grad_norm": 0.06584819530100704, + "language_loss": 0.80398101, + "learning_rate": 1.8337259398708616e-05, + "loss": 0.81463587, + "num_input_tokens_seen": 393841232, + "router_z_loss_mlp": 0.14221191, + "step": 4762, + "time_per_iteration": 2.758117437362671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066785, + "balance_loss_mlp": 1.05253971, + "epoch": 0.9163139669103502, + "flos": 590624649216.0, + "grad_norm": 0.07549229769112886, + "language_loss": 0.80455124, + "learning_rate": 1.8253754997106632e-05, + "loss": 0.81521916, + "num_input_tokens_seen": 393910512, + 
"router_z_loss_mlp": 0.14233398, + "step": 4763, + "time_per_iteration": 2.7597572803497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071711, + "balance_loss_mlp": 1.05723906, + "epoch": 0.9165063485956138, + "flos": 821975081472.0, + "grad_norm": 0.05989535024703023, + "language_loss": 0.84422004, + "learning_rate": 1.817043762598397e-05, + "loss": 0.8549372, + "num_input_tokens_seen": 393988624, + "router_z_loss_mlp": 0.14477539, + "step": 4764, + "time_per_iteration": 3.077842950820923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066851, + "balance_loss_mlp": 1.05241537, + "epoch": 0.9166987302808772, + "flos": 525194772480.0, + "grad_norm": 0.09553183117791494, + "language_loss": 0.8191523, + "learning_rate": 1.8087307317687264e-05, + "loss": 0.82982075, + "num_input_tokens_seen": 394059184, + "router_z_loss_mlp": 0.14428711, + "step": 4765, + "time_per_iteration": 2.644554376602173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069218, + "balance_loss_mlp": 1.05447185, + "epoch": 0.9168911119661408, + "flos": 655095693312.0, + "grad_norm": 0.06729500236914439, + "language_loss": 0.84236819, + "learning_rate": 1.800436410449058e-05, + "loss": 0.85306036, + "num_input_tokens_seen": 394142160, + "router_z_loss_mlp": 0.14733887, + "step": 4766, + "time_per_iteration": 2.9453341960906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074214, + "balance_loss_mlp": 1.05980158, + "epoch": 0.9170834936514044, + "flos": 491747314176.0, + "grad_norm": 0.06576753433024131, + "language_loss": 0.84860098, + "learning_rate": 1.7921608018595436e-05, + "loss": 0.85934317, + "num_input_tokens_seen": 394207056, + "router_z_loss_mlp": 0.14416504, + "step": 4767, + "time_per_iteration": 2.5406041145324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067338, + "balance_loss_mlp": 1.05275846, + "epoch": 0.917275875336668, + "flos": 628040314368.0, + "grad_norm": 
0.09157964796114802, + "language_loss": 0.80223978, + "learning_rate": 1.7839039092130415e-05, + "loss": 0.81291318, + "num_input_tokens_seen": 394275456, + "router_z_loss_mlp": 0.14550781, + "step": 4768, + "time_per_iteration": 2.809734344482422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010723, + "balance_loss_mlp": 1.00447667, + "epoch": 0.9174682570219315, + "flos": 1517981824512.0, + "grad_norm": 0.007898028987614235, + "language_loss": 0.78180236, + "learning_rate": 1.7756657357151762e-05, + "loss": 0.79190958, + "num_input_tokens_seen": 394503808, + "router_z_loss_mlp": 0.0625, + "step": 4769, + "time_per_iteration": 4.9082324504852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066063, + "balance_loss_mlp": 1.05191278, + "epoch": 0.917660638707195, + "flos": 560021008896.0, + "grad_norm": 0.06184746471686271, + "language_loss": 0.84936714, + "learning_rate": 1.7674462845642835e-05, + "loss": 0.86002773, + "num_input_tokens_seen": 394573776, + "router_z_loss_mlp": 0.1418457, + "step": 4770, + "time_per_iteration": 2.679088830947876 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072014, + "balance_loss_mlp": 1.05756545, + "epoch": 0.9178530203924586, + "flos": 447252751872.0, + "grad_norm": 0.09313381459116095, + "language_loss": 0.83899945, + "learning_rate": 1.7592455589514387e-05, + "loss": 0.84971958, + "num_input_tokens_seen": 394637600, + "router_z_loss_mlp": 0.14440918, + "step": 4771, + "time_per_iteration": 2.519746780395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065875, + "balance_loss_mlp": 1.05180812, + "epoch": 0.9180454020777222, + "flos": 465981507072.0, + "grad_norm": 0.07078092470079442, + "language_loss": 0.8057059, + "learning_rate": 1.7510635620604453e-05, + "loss": 0.81636465, + "num_input_tokens_seen": 394707344, + "router_z_loss_mlp": 0.140625, + "step": 4772, + "time_per_iteration": 2.573575496673584 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01069359, + "balance_loss_mlp": 1.05537605, + "epoch": 0.9182377837629858, + "flos": 596314856448.0, + "grad_norm": 0.08298754862360035, + "language_loss": 0.87202299, + "learning_rate": 1.74290029706784e-05, + "loss": 0.88271654, + "num_input_tokens_seen": 394786368, + "router_z_loss_mlp": 0.13989258, + "step": 4773, + "time_per_iteration": 2.782898187637329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071717, + "balance_loss_mlp": 1.05753124, + "epoch": 0.9184301654482493, + "flos": 996671941632.0, + "grad_norm": 0.06677981987343952, + "language_loss": 0.82528126, + "learning_rate": 1.734755767142876e-05, + "loss": 0.83599842, + "num_input_tokens_seen": 394876976, + "router_z_loss_mlp": 0.14196777, + "step": 4774, + "time_per_iteration": 3.3350989818573 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069723, + "balance_loss_mlp": 1.05540562, + "epoch": 0.9186225471335129, + "flos": 508860154368.0, + "grad_norm": 0.07200425768913102, + "language_loss": 0.84860492, + "learning_rate": 1.7266299754475467e-05, + "loss": 0.85930216, + "num_input_tokens_seen": 394949024, + "router_z_loss_mlp": 0.14306641, + "step": 4775, + "time_per_iteration": 2.7125747203826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067689, + "balance_loss_mlp": 1.05345559, + "epoch": 0.9188149288187765, + "flos": 940423633920.0, + "grad_norm": 0.07577615196138396, + "language_loss": 0.78980851, + "learning_rate": 1.718522925136551e-05, + "loss": 0.80048543, + "num_input_tokens_seen": 395044352, + "router_z_loss_mlp": 0.14245605, + "step": 4776, + "time_per_iteration": 3.3351941108703613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065369, + "balance_loss_mlp": 1.05136228, + "epoch": 0.91900731050404, + "flos": 583674232320.0, + "grad_norm": 0.08146197777200662, + "language_loss": 0.83863878, + "learning_rate": 1.7104346193573484e-05, + "loss": 0.84929252, + "num_input_tokens_seen": 
395113824, + "router_z_loss_mlp": 0.14013672, + "step": 4777, + "time_per_iteration": 2.672926664352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067592, + "balance_loss_mlp": 1.05339432, + "epoch": 0.9191996921893035, + "flos": 581213283840.0, + "grad_norm": 0.08031024809047536, + "language_loss": 0.79444981, + "learning_rate": 1.7023650612500828e-05, + "loss": 0.80512571, + "num_input_tokens_seen": 395184496, + "router_z_loss_mlp": 0.14196777, + "step": 4778, + "time_per_iteration": 2.6956064701080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065323, + "balance_loss_mlp": 1.05133939, + "epoch": 0.9193920738745671, + "flos": 908935686144.0, + "grad_norm": 0.0795713014857256, + "language_loss": 0.79998899, + "learning_rate": 1.6943142539476374e-05, + "loss": 0.81064218, + "num_input_tokens_seen": 395263760, + "router_z_loss_mlp": 0.13989258, + "step": 4779, + "time_per_iteration": 3.103442430496216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010724, + "balance_loss_mlp": 1.00447774, + "epoch": 0.9195844555598307, + "flos": 1558372359168.0, + "grad_norm": 0.00788177819914121, + "language_loss": 0.79795396, + "learning_rate": 1.686282200575606e-05, + "loss": 0.80806112, + "num_input_tokens_seen": 395482384, + "router_z_loss_mlp": 0.0625, + "step": 4780, + "time_per_iteration": 4.735037326812744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065599, + "balance_loss_mlp": 1.05109096, + "epoch": 0.9197768372450943, + "flos": 474053741568.0, + "grad_norm": 0.07512893938513913, + "language_loss": 0.78746933, + "learning_rate": 1.678268904252317e-05, + "loss": 0.79812533, + "num_input_tokens_seen": 395550384, + "router_z_loss_mlp": 0.14489746, + "step": 4781, + "time_per_iteration": 2.600867748260498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069019, + "balance_loss_mlp": 1.05447555, + "epoch": 0.9199692189303579, + "flos": 857016059904.0, + "grad_norm": 
0.07162373169209806, + "language_loss": 0.84339678, + "learning_rate": 1.6702743680888088e-05, + "loss": 0.85408694, + "num_input_tokens_seen": 395632320, + "router_z_loss_mlp": 0.14526367, + "step": 4782, + "time_per_iteration": 3.215178966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070786, + "balance_loss_mlp": 1.05633759, + "epoch": 0.9201616006156214, + "flos": 504390509568.0, + "grad_norm": 0.08066982775893859, + "language_loss": 0.77412266, + "learning_rate": 1.6622985951888327e-05, + "loss": 0.78483045, + "num_input_tokens_seen": 395703856, + "router_z_loss_mlp": 0.14428711, + "step": 4783, + "time_per_iteration": 2.6220128536224365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065795, + "balance_loss_mlp": 1.05112016, + "epoch": 0.9203539823008849, + "flos": 548781184512.0, + "grad_norm": 0.07094596583832717, + "language_loss": 0.84888017, + "learning_rate": 1.6543415886488554e-05, + "loss": 0.85953808, + "num_input_tokens_seen": 395779456, + "router_z_loss_mlp": 0.14660645, + "step": 4784, + "time_per_iteration": 2.7147135734558105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063787, + "balance_loss_mlp": 1.04949427, + "epoch": 0.9205463639861485, + "flos": 540004879872.0, + "grad_norm": 0.0697955041806186, + "language_loss": 0.8231988, + "learning_rate": 1.6464033515580624e-05, + "loss": 0.83383662, + "num_input_tokens_seen": 395849584, + "router_z_loss_mlp": 0.14294434, + "step": 4785, + "time_per_iteration": 2.6527657508850098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065383, + "balance_loss_mlp": 1.05107796, + "epoch": 0.9207387456714121, + "flos": 799725229056.0, + "grad_norm": 0.07376799317416433, + "language_loss": 0.78239089, + "learning_rate": 1.6384838869983488e-05, + "loss": 0.79304475, + "num_input_tokens_seen": 395943712, + "router_z_loss_mlp": 0.14306641, + "step": 4786, + "time_per_iteration": 3.0446088314056396 + }, + { + "auxiliary_loss_clip": 
0.0, + "auxiliary_loss_mlp": 0.01068332, + "balance_loss_mlp": 1.05381203, + "epoch": 0.9209311273566756, + "flos": 502848746496.0, + "grad_norm": 0.07061124245304527, + "language_loss": 0.78827488, + "learning_rate": 1.630583198044333e-05, + "loss": 0.79895824, + "num_input_tokens_seen": 396013168, + "router_z_loss_mlp": 0.14501953, + "step": 4787, + "time_per_iteration": 2.6726601123809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069793, + "balance_loss_mlp": 1.05524909, + "epoch": 0.9211235090419392, + "flos": 569323717632.0, + "grad_norm": 0.07225837689316757, + "language_loss": 0.82407451, + "learning_rate": 1.6227012877633173e-05, + "loss": 0.83477247, + "num_input_tokens_seen": 396082032, + "router_z_loss_mlp": 0.1451416, + "step": 4788, + "time_per_iteration": 2.759333372116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070256, + "balance_loss_mlp": 1.05587983, + "epoch": 0.9213158907272028, + "flos": 806549736960.0, + "grad_norm": 0.07835981374467402, + "language_loss": 0.8217482, + "learning_rate": 1.6148381592153538e-05, + "loss": 0.83245075, + "num_input_tokens_seen": 396157984, + "router_z_loss_mlp": 0.14379883, + "step": 4789, + "time_per_iteration": 3.032362937927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064491, + "balance_loss_mlp": 1.04975629, + "epoch": 0.9215082724124664, + "flos": 490682396160.0, + "grad_norm": 0.07372153379285619, + "language_loss": 0.76175332, + "learning_rate": 1.6069938154531618e-05, + "loss": 0.77239823, + "num_input_tokens_seen": 396223840, + "router_z_loss_mlp": 0.14697266, + "step": 4790, + "time_per_iteration": 2.564042806625366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01011897, + "balance_loss_mlp": 1.00565076, + "epoch": 0.9217006540977299, + "flos": 1514495232000.0, + "grad_norm": 0.008243719143982569, + "language_loss": 0.77070266, + "learning_rate": 1.599168259522188e-05, + "loss": 0.78082162, + 
"num_input_tokens_seen": 396458288, + "router_z_loss_mlp": 0.0625, + "step": 4791, + "time_per_iteration": 4.962024211883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107205, + "balance_loss_mlp": 1.05747056, + "epoch": 0.9218930357829934, + "flos": 743793352704.0, + "grad_norm": 0.06990923251195422, + "language_loss": 0.76191884, + "learning_rate": 1.5913614944605804e-05, + "loss": 0.77263939, + "num_input_tokens_seen": 396536208, + "router_z_loss_mlp": 0.14562988, + "step": 4792, + "time_per_iteration": 2.945852756500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067274, + "balance_loss_mlp": 1.05317199, + "epoch": 0.922085417468257, + "flos": 453036934656.0, + "grad_norm": 0.08745531559957272, + "language_loss": 0.80308163, + "learning_rate": 1.5835735232992032e-05, + "loss": 0.81375438, + "num_input_tokens_seen": 396599984, + "router_z_loss_mlp": 0.14086914, + "step": 4793, + "time_per_iteration": 2.528007984161377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062006, + "balance_loss_mlp": 1.04761744, + "epoch": 0.9222777991535206, + "flos": 500249405952.0, + "grad_norm": 0.07470871936442788, + "language_loss": 0.85101461, + "learning_rate": 1.575804349061616e-05, + "loss": 0.86163461, + "num_input_tokens_seen": 396664592, + "router_z_loss_mlp": 0.14367676, + "step": 4794, + "time_per_iteration": 2.5768916606903076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069107, + "balance_loss_mlp": 1.0545758, + "epoch": 0.9224701808387842, + "flos": 527959669248.0, + "grad_norm": 0.07688197326977388, + "language_loss": 0.78963321, + "learning_rate": 1.5680539747640722e-05, + "loss": 0.80032432, + "num_input_tokens_seen": 396729472, + "router_z_loss_mlp": 0.14550781, + "step": 4795, + "time_per_iteration": 2.5921027660369873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067432, + "balance_loss_mlp": 1.05369949, + "epoch": 0.9226625625240477, + "flos": 
874640623104.0, + "grad_norm": 0.06384518887358884, + "language_loss": 0.75556517, + "learning_rate": 1.5603224034155315e-05, + "loss": 0.7662394, + "num_input_tokens_seen": 396810384, + "router_z_loss_mlp": 0.13757324, + "step": 4796, + "time_per_iteration": 3.133517026901245 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069777, + "balance_loss_mlp": 1.05577016, + "epoch": 0.9228549442093112, + "flos": 502774594560.0, + "grad_norm": 0.07425958905143133, + "language_loss": 0.87898898, + "learning_rate": 1.5526096380176657e-05, + "loss": 0.88968676, + "num_input_tokens_seen": 396875472, + "router_z_loss_mlp": 0.14013672, + "step": 4797, + "time_per_iteration": 2.5944347381591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01058503, + "balance_loss_mlp": 1.04417384, + "epoch": 0.9230473258945748, + "flos": 599989026816.0, + "grad_norm": 0.06949272728529254, + "language_loss": 0.85180724, + "learning_rate": 1.544915681564829e-05, + "loss": 0.86239231, + "num_input_tokens_seen": 396949888, + "router_z_loss_mlp": 0.14331055, + "step": 4798, + "time_per_iteration": 2.866840362548828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01059378, + "balance_loss_mlp": 1.04494166, + "epoch": 0.9232397075798384, + "flos": 822508826112.0, + "grad_norm": 0.09329142732010037, + "language_loss": 0.79354167, + "learning_rate": 1.5372405370440822e-05, + "loss": 0.8041355, + "num_input_tokens_seen": 397027504, + "router_z_loss_mlp": 0.14404297, + "step": 4799, + "time_per_iteration": 3.091614246368408 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068353, + "balance_loss_mlp": 1.05402422, + "epoch": 0.923432089265102, + "flos": 707030000640.0, + "grad_norm": 0.08846547031337017, + "language_loss": 0.84656245, + "learning_rate": 1.5295842074351805e-05, + "loss": 0.85724592, + "num_input_tokens_seen": 397101600, + "router_z_loss_mlp": 0.14343262, + "step": 4800, + "time_per_iteration": 2.9078805446624756 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065128, + "balance_loss_mlp": 1.05058432, + "epoch": 0.9236244709503655, + "flos": 701861054976.0, + "grad_norm": 0.0965298832056426, + "language_loss": 0.76793849, + "learning_rate": 1.5219466957105798e-05, + "loss": 0.77858973, + "num_input_tokens_seen": 397170880, + "router_z_loss_mlp": 0.1451416, + "step": 4801, + "time_per_iteration": 2.9429283142089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067185, + "balance_loss_mlp": 1.05273724, + "epoch": 0.9238168526356291, + "flos": 515039689728.0, + "grad_norm": 0.07355444560642876, + "language_loss": 0.83979952, + "learning_rate": 1.5143280048354136e-05, + "loss": 0.85047144, + "num_input_tokens_seen": 397242272, + "router_z_loss_mlp": 0.14428711, + "step": 4802, + "time_per_iteration": 2.660900592803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064512, + "balance_loss_mlp": 1.04980135, + "epoch": 0.9240092343208927, + "flos": 492024098304.0, + "grad_norm": 0.07241857247571085, + "language_loss": 0.81500518, + "learning_rate": 1.5067281377675213e-05, + "loss": 0.82565027, + "num_input_tokens_seen": 397308032, + "router_z_loss_mlp": 0.14697266, + "step": 4803, + "time_per_iteration": 2.5832154750823975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106681, + "balance_loss_mlp": 1.05224264, + "epoch": 0.9242016160061562, + "flos": 647218750464.0, + "grad_norm": 0.06962127542934941, + "language_loss": 0.73689508, + "learning_rate": 1.4991470974574484e-05, + "loss": 0.74756318, + "num_input_tokens_seen": 397390944, + "router_z_loss_mlp": 0.14562988, + "step": 4804, + "time_per_iteration": 2.900785446166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065366, + "balance_loss_mlp": 1.05101275, + "epoch": 0.9243939976914197, + "flos": 729430354944.0, + "grad_norm": 0.07299214948717701, + "language_loss": 0.79122543, + "learning_rate": 1.4915848868484016e-05, + "loss": 
0.80187905, + "num_input_tokens_seen": 397468128, + "router_z_loss_mlp": 0.14367676, + "step": 4805, + "time_per_iteration": 2.974085807800293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067059, + "balance_loss_mlp": 1.05249214, + "epoch": 0.9245863793766833, + "flos": 452246229504.0, + "grad_norm": 0.09554906471121519, + "language_loss": 0.90652919, + "learning_rate": 1.4840415088763048e-05, + "loss": 0.91719973, + "num_input_tokens_seen": 397538976, + "router_z_loss_mlp": 0.14550781, + "step": 4806, + "time_per_iteration": 2.6065399646759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066321, + "balance_loss_mlp": 1.05214715, + "epoch": 0.9247787610619469, + "flos": 755030605824.0, + "grad_norm": 0.08136491932055226, + "language_loss": 0.76982534, + "learning_rate": 1.476516966469732e-05, + "loss": 0.78048849, + "num_input_tokens_seen": 397612944, + "router_z_loss_mlp": 0.1418457, + "step": 4807, + "time_per_iteration": 2.940830945968628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066915, + "balance_loss_mlp": 1.05195403, + "epoch": 0.9249711427472105, + "flos": 561928389120.0, + "grad_norm": 0.06417953395011357, + "language_loss": 0.85199314, + "learning_rate": 1.4690112625499908e-05, + "loss": 0.86266232, + "num_input_tokens_seen": 397690848, + "router_z_loss_mlp": 0.14953613, + "step": 4808, + "time_per_iteration": 2.771059274673462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067535, + "balance_loss_mlp": 1.05274093, + "epoch": 0.9251635244324741, + "flos": 526699459584.0, + "grad_norm": 0.06608853421706948, + "language_loss": 0.85035574, + "learning_rate": 1.4615244000310501e-05, + "loss": 0.86103106, + "num_input_tokens_seen": 397761008, + "router_z_loss_mlp": 0.14770508, + "step": 4809, + "time_per_iteration": 2.694859266281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064571, + "balance_loss_mlp": 1.05006337, + "epoch": 0.9253559061177375, + 
"flos": 611280608256.0, + "grad_norm": 0.07734863972631102, + "language_loss": 0.79164314, + "learning_rate": 1.4540563818195685e-05, + "loss": 0.80228883, + "num_input_tokens_seen": 397840640, + "router_z_loss_mlp": 0.14501953, + "step": 4810, + "time_per_iteration": 2.813447952270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008835, + "balance_loss_mlp": 1.00258815, + "epoch": 0.9255482878030011, + "flos": 1551258957312.0, + "grad_norm": 0.006837733446229171, + "language_loss": 0.76925391, + "learning_rate": 1.446607210814882e-05, + "loss": 0.77934223, + "num_input_tokens_seen": 398060096, + "router_z_loss_mlp": 0.0625, + "step": 4811, + "time_per_iteration": 4.736983299255371 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072178, + "balance_loss_mlp": 1.05792069, + "epoch": 0.9257406694882647, + "flos": 766366603776.0, + "grad_norm": 0.08162744233386064, + "language_loss": 0.80772638, + "learning_rate": 1.4391768899090219e-05, + "loss": 0.81844819, + "num_input_tokens_seen": 398143680, + "router_z_loss_mlp": 0.14245605, + "step": 4812, + "time_per_iteration": 3.054450750350952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106576, + "balance_loss_mlp": 1.05143118, + "epoch": 0.9259330511735283, + "flos": 497991089664.0, + "grad_norm": 0.08374138374324222, + "language_loss": 0.83075398, + "learning_rate": 1.431765421986686e-05, + "loss": 0.84141165, + "num_input_tokens_seen": 398207056, + "router_z_loss_mlp": 0.14331055, + "step": 4813, + "time_per_iteration": 2.639411687850952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067481, + "balance_loss_mlp": 1.05308032, + "epoch": 0.9261254328587919, + "flos": 626874080256.0, + "grad_norm": 0.08153883506876486, + "language_loss": 0.79092741, + "learning_rate": 1.424372809925273e-05, + "loss": 0.80160224, + "num_input_tokens_seen": 398277472, + "router_z_loss_mlp": 0.14379883, + "step": 4814, + "time_per_iteration": 2.75715708732605 + 
}, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067664, + "balance_loss_mlp": 1.0535382, + "epoch": 0.9263178145440554, + "flos": 597382345728.0, + "grad_norm": 0.07799817914897651, + "language_loss": 0.85397398, + "learning_rate": 1.416999056594831e-05, + "loss": 0.86465067, + "num_input_tokens_seen": 398346544, + "router_z_loss_mlp": 0.14135742, + "step": 4815, + "time_per_iteration": 2.766474723815918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068843, + "balance_loss_mlp": 1.05474079, + "epoch": 0.926510196229319, + "flos": 388563319296.0, + "grad_norm": 0.09007822488633566, + "language_loss": 0.83693337, + "learning_rate": 1.4096441648581259e-05, + "loss": 0.84762168, + "num_input_tokens_seen": 398409344, + "router_z_loss_mlp": 0.14099121, + "step": 4816, + "time_per_iteration": 2.4716956615448 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067782, + "balance_loss_mlp": 1.05334568, + "epoch": 0.9267025779145825, + "flos": 545798974464.0, + "grad_norm": 0.09455897697825383, + "language_loss": 0.84119844, + "learning_rate": 1.4023081375705737e-05, + "loss": 0.85187626, + "num_input_tokens_seen": 398478816, + "router_z_loss_mlp": 0.14404297, + "step": 4817, + "time_per_iteration": 2.6396780014038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070795, + "balance_loss_mlp": 1.05666864, + "epoch": 0.9268949595998461, + "flos": 499789813248.0, + "grad_norm": 0.06599557905688819, + "language_loss": 0.82125562, + "learning_rate": 1.3949909775802682e-05, + "loss": 0.8319636, + "num_input_tokens_seen": 398550384, + "router_z_loss_mlp": 0.14135742, + "step": 4818, + "time_per_iteration": 2.636826992034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063864, + "balance_loss_mlp": 1.04934382, + "epoch": 0.9270873412851096, + "flos": 432828085248.0, + "grad_norm": 0.07547927657855338, + "language_loss": 0.82790685, + "learning_rate": 1.3876926877279817e-05, + "loss": 
0.83854544, + "num_input_tokens_seen": 398620832, + "router_z_loss_mlp": 0.14501953, + "step": 4819, + "time_per_iteration": 2.6638593673706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065963, + "balance_loss_mlp": 1.05182457, + "epoch": 0.9272797229703732, + "flos": 466769640960.0, + "grad_norm": 0.08095696097618853, + "language_loss": 0.85950172, + "learning_rate": 1.380413270847164e-05, + "loss": 0.87016135, + "num_input_tokens_seen": 398689776, + "router_z_loss_mlp": 0.14135742, + "step": 4820, + "time_per_iteration": 2.61427640914917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065047, + "balance_loss_mlp": 1.05034828, + "epoch": 0.9274721046556368, + "flos": 704838122496.0, + "grad_norm": 0.1143373628903449, + "language_loss": 0.79004455, + "learning_rate": 1.373152729763938e-05, + "loss": 0.800695, + "num_input_tokens_seen": 398775072, + "router_z_loss_mlp": 0.14672852, + "step": 4821, + "time_per_iteration": 3.046144723892212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008904, + "balance_loss_mlp": 1.00265718, + "epoch": 0.9276644863409004, + "flos": 1402255950336.0, + "grad_norm": 0.006840762766732248, + "language_loss": 0.82380462, + "learning_rate": 1.3659110672970931e-05, + "loss": 0.83389366, + "num_input_tokens_seen": 399002016, + "router_z_loss_mlp": 0.0625, + "step": 4822, + "time_per_iteration": 4.890657901763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065245, + "balance_loss_mlp": 1.05101109, + "epoch": 0.927856868026164, + "flos": 741722614272.0, + "grad_norm": 0.07071256665988469, + "language_loss": 0.80128741, + "learning_rate": 1.3586882862580917e-05, + "loss": 0.81193984, + "num_input_tokens_seen": 399085808, + "router_z_loss_mlp": 0.14245605, + "step": 4823, + "time_per_iteration": 3.0425524711608887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068987, + "balance_loss_mlp": 1.05459857, + "epoch": 0.9280492497114274, + 
"flos": 412223883264.0, + "grad_norm": 0.08552550335100627, + "language_loss": 0.73997277, + "learning_rate": 1.3514843894510686e-05, + "loss": 0.75066262, + "num_input_tokens_seen": 399146768, + "router_z_loss_mlp": 0.14367676, + "step": 4824, + "time_per_iteration": 2.5228898525238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066643, + "balance_loss_mlp": 1.05221891, + "epoch": 0.928241631396691, + "flos": 646504768512.0, + "grad_norm": 0.1058125688728975, + "language_loss": 0.83748496, + "learning_rate": 1.3442993796728254e-05, + "loss": 0.84815139, + "num_input_tokens_seen": 399220192, + "router_z_loss_mlp": 0.14428711, + "step": 4825, + "time_per_iteration": 2.7949647903442383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066615, + "balance_loss_mlp": 1.0524534, + "epoch": 0.9284340130819546, + "flos": 696855094272.0, + "grad_norm": 0.06762366006389377, + "language_loss": 0.80860943, + "learning_rate": 1.3371332597128249e-05, + "loss": 0.81927556, + "num_input_tokens_seen": 399300064, + "router_z_loss_mlp": 0.14154053, + "step": 4826, + "time_per_iteration": 3.0083041191101074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065832, + "balance_loss_mlp": 1.05130076, + "epoch": 0.9286263947672182, + "flos": 759132062208.0, + "grad_norm": 0.058538603335969094, + "language_loss": 0.83601272, + "learning_rate": 1.3299860323532032e-05, + "loss": 0.84667104, + "num_input_tokens_seen": 399383200, + "router_z_loss_mlp": 0.14538574, + "step": 4827, + "time_per_iteration": 3.053251266479492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106725, + "balance_loss_mlp": 1.05277789, + "epoch": 0.9288187764524817, + "flos": 672823770624.0, + "grad_norm": 0.07416078307622533, + "language_loss": 0.80154818, + "learning_rate": 1.3228577003687681e-05, + "loss": 0.81222069, + "num_input_tokens_seen": 399466400, + "router_z_loss_mlp": 0.14465332, + "step": 4828, + "time_per_iteration": 
2.9393198490142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070143, + "balance_loss_mlp": 1.05600464, + "epoch": 0.9290111581377453, + "flos": 500469290496.0, + "grad_norm": 0.08130482924978269, + "language_loss": 0.83759892, + "learning_rate": 1.3157482665269727e-05, + "loss": 0.84830034, + "num_input_tokens_seen": 399533504, + "router_z_loss_mlp": 0.14135742, + "step": 4829, + "time_per_iteration": 2.5792195796966553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008805, + "balance_loss_mlp": 1.00255847, + "epoch": 0.9292035398230089, + "flos": 1563627566592.0, + "grad_norm": 0.006851389377426983, + "language_loss": 0.72122061, + "learning_rate": 1.3086577335879424e-05, + "loss": 0.7313087, + "num_input_tokens_seen": 399769872, + "router_z_loss_mlp": 0.0625, + "step": 4830, + "time_per_iteration": 4.982414722442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008797, + "balance_loss_mlp": 1.00255001, + "epoch": 0.9293959215082724, + "flos": 1518673411584.0, + "grad_norm": 0.006852258928936433, + "language_loss": 0.79511833, + "learning_rate": 1.3015861043044753e-05, + "loss": 0.8052063, + "num_input_tokens_seen": 399997760, + "router_z_loss_mlp": 0.0625, + "step": 4831, + "time_per_iteration": 4.867983341217041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064215, + "balance_loss_mlp": 1.04946923, + "epoch": 0.929588303193536, + "flos": 557836844544.0, + "grad_norm": 0.08964822513269201, + "language_loss": 0.83914268, + "learning_rate": 1.2945333814220195e-05, + "loss": 0.84978479, + "num_input_tokens_seen": 400063872, + "router_z_loss_mlp": 0.14733887, + "step": 4832, + "time_per_iteration": 2.731626272201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067018, + "balance_loss_mlp": 1.05259395, + "epoch": 0.9297806848787995, + "flos": 478580285952.0, + "grad_norm": 0.08096360124602932, + "language_loss": 0.80279034, + "learning_rate": 
1.2874995676786905e-05, + "loss": 0.81346047, + "num_input_tokens_seen": 400126064, + "router_z_loss_mlp": 0.14416504, + "step": 4833, + "time_per_iteration": 2.5306105613708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069828, + "balance_loss_mlp": 1.05523705, + "epoch": 0.9299730665640631, + "flos": 564537641472.0, + "grad_norm": 0.07716391645519798, + "language_loss": 0.80337012, + "learning_rate": 1.2804846658052372e-05, + "loss": 0.81406838, + "num_input_tokens_seen": 400201776, + "router_z_loss_mlp": 0.14587402, + "step": 4834, + "time_per_iteration": 2.769162654876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067198, + "balance_loss_mlp": 1.05315518, + "epoch": 0.9301654482493267, + "flos": 560174082048.0, + "grad_norm": 0.08258292328826174, + "language_loss": 0.82621527, + "learning_rate": 1.2734886785251032e-05, + "loss": 0.83688718, + "num_input_tokens_seen": 400279504, + "router_z_loss_mlp": 0.14050293, + "step": 4835, + "time_per_iteration": 2.7922258377075195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008368, + "balance_loss_mlp": 1.00212157, + "epoch": 0.9303578299345903, + "flos": 1520096606208.0, + "grad_norm": 0.006037144281499386, + "language_loss": 0.76852441, + "learning_rate": 1.2665116085543715e-05, + "loss": 0.77860808, + "num_input_tokens_seen": 400514800, + "router_z_loss_mlp": 0.0625, + "step": 4836, + "time_per_iteration": 5.028789281845093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106964, + "balance_loss_mlp": 1.05518019, + "epoch": 0.9305502116198537, + "flos": 530843134464.0, + "grad_norm": 0.0847586307722568, + "language_loss": 0.82820576, + "learning_rate": 1.2595534586017698e-05, + "loss": 0.83890218, + "num_input_tokens_seen": 400582640, + "router_z_loss_mlp": 0.14477539, + "step": 4837, + "time_per_iteration": 2.7545266151428223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106575, + "balance_loss_mlp": 1.05152798, 
+ "epoch": 0.9307425933051173, + "flos": 474898775040.0, + "grad_norm": 0.10091492375500201, + "language_loss": 0.81374753, + "learning_rate": 1.2526142313686983e-05, + "loss": 0.82440501, + "num_input_tokens_seen": 400646912, + "router_z_loss_mlp": 0.14221191, + "step": 4838, + "time_per_iteration": 2.5650086402893066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066794, + "balance_loss_mlp": 1.05262041, + "epoch": 0.9309349749903809, + "flos": 584892223488.0, + "grad_norm": 0.06802795816688911, + "language_loss": 0.86762273, + "learning_rate": 1.245693929549213e-05, + "loss": 0.87829065, + "num_input_tokens_seen": 400722128, + "router_z_loss_mlp": 0.14172363, + "step": 4839, + "time_per_iteration": 2.7399027347564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067935, + "balance_loss_mlp": 1.05323696, + "epoch": 0.9311273566756445, + "flos": 861666315264.0, + "grad_norm": 0.061246816450390304, + "language_loss": 0.76902699, + "learning_rate": 1.2387925558299984e-05, + "loss": 0.77970636, + "num_input_tokens_seen": 400801440, + "router_z_loss_mlp": 0.14697266, + "step": 4840, + "time_per_iteration": 3.094343662261963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064483, + "balance_loss_mlp": 1.05036891, + "epoch": 0.9313197383609081, + "flos": 548094366720.0, + "grad_norm": 0.07423165545270044, + "language_loss": 0.82531536, + "learning_rate": 1.231910112890411e-05, + "loss": 0.83596015, + "num_input_tokens_seen": 400873008, + "router_z_loss_mlp": 0.14111328, + "step": 4841, + "time_per_iteration": 2.644543409347534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067111, + "balance_loss_mlp": 1.05234146, + "epoch": 0.9315121200461716, + "flos": 468756315648.0, + "grad_norm": 0.07934461180224898, + "language_loss": 0.80920649, + "learning_rate": 1.2250466034024522e-05, + "loss": 0.81987751, + "num_input_tokens_seen": 400935328, + "router_z_loss_mlp": 0.14746094, + "step": 4842, + 
"time_per_iteration": 2.531503200531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010695, + "balance_loss_mlp": 1.05520725, + "epoch": 0.9317045017314352, + "flos": 417659701248.0, + "grad_norm": 0.09122978620676214, + "language_loss": 0.77761424, + "learning_rate": 1.2182020300307684e-05, + "loss": 0.78830928, + "num_input_tokens_seen": 401000720, + "router_z_loss_mlp": 0.14294434, + "step": 4843, + "time_per_iteration": 2.5434539318084717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066121, + "balance_loss_mlp": 1.0517211, + "epoch": 0.9318968834166987, + "flos": 540489065472.0, + "grad_norm": 0.07745490601900848, + "language_loss": 0.77002335, + "learning_rate": 1.2113763954326729e-05, + "loss": 0.78068453, + "num_input_tokens_seen": 401079664, + "router_z_loss_mlp": 0.14379883, + "step": 4844, + "time_per_iteration": 2.75764536857605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071049, + "balance_loss_mlp": 1.05657732, + "epoch": 0.9320892651019623, + "flos": 521330452992.0, + "grad_norm": 0.0723719710231409, + "language_loss": 0.80705845, + "learning_rate": 1.2045697022581015e-05, + "loss": 0.81776899, + "num_input_tokens_seen": 401146160, + "router_z_loss_mlp": 0.14465332, + "step": 4845, + "time_per_iteration": 2.638796329498291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066086, + "balance_loss_mlp": 1.05176866, + "epoch": 0.9322816467872258, + "flos": 582072998400.0, + "grad_norm": 0.08462986407031685, + "language_loss": 0.80856788, + "learning_rate": 1.1977819531496348e-05, + "loss": 0.81922877, + "num_input_tokens_seen": 401223264, + "router_z_loss_mlp": 0.14294434, + "step": 4846, + "time_per_iteration": 2.7654807567596436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068849, + "balance_loss_mlp": 1.05425739, + "epoch": 0.9324740284724894, + "flos": 484747338240.0, + "grad_norm": 0.06874284999019256, + "language_loss": 0.81851065, + 
"learning_rate": 1.191013150742537e-05, + "loss": 0.82919914, + "num_input_tokens_seen": 401296368, + "router_z_loss_mlp": 0.14562988, + "step": 4847, + "time_per_iteration": 2.7250354290008545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065082, + "balance_loss_mlp": 1.05045485, + "epoch": 0.932666410157753, + "flos": 732585461760.0, + "grad_norm": 0.07610709588397915, + "language_loss": 0.82762969, + "learning_rate": 1.1842632976646672e-05, + "loss": 0.8382805, + "num_input_tokens_seen": 401383936, + "router_z_loss_mlp": 0.14599609, + "step": 4848, + "time_per_iteration": 3.0495240688323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061882, + "balance_loss_mlp": 1.04709959, + "epoch": 0.9328587918430166, + "flos": 965537127936.0, + "grad_norm": 0.0681296459955147, + "language_loss": 0.78606904, + "learning_rate": 1.1775323965365681e-05, + "loss": 0.7966879, + "num_input_tokens_seen": 401468784, + "router_z_loss_mlp": 0.14758301, + "step": 4849, + "time_per_iteration": 3.258883476257324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067459, + "balance_loss_mlp": 1.05304718, + "epoch": 0.9330511735282802, + "flos": 614552085504.0, + "grad_norm": 0.07341597020423554, + "language_loss": 0.8031379, + "learning_rate": 1.1708204499713936e-05, + "loss": 0.81381249, + "num_input_tokens_seen": 401539712, + "router_z_loss_mlp": 0.14416504, + "step": 4850, + "time_per_iteration": 2.7913970947265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067673, + "balance_loss_mlp": 1.05315292, + "epoch": 0.9332435552135436, + "flos": 559101823488.0, + "grad_norm": 0.06951719849091244, + "language_loss": 0.85507822, + "learning_rate": 1.1641274605749653e-05, + "loss": 0.8657549, + "num_input_tokens_seen": 401610432, + "router_z_loss_mlp": 0.14489746, + "step": 4851, + "time_per_iteration": 2.708566427230835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063138, + 
"balance_loss_mlp": 1.04864168, + "epoch": 0.9334359368988072, + "flos": 515536358400.0, + "grad_norm": 0.08442635814792841, + "language_loss": 0.81517386, + "learning_rate": 1.1574534309457208e-05, + "loss": 0.82580519, + "num_input_tokens_seen": 401677344, + "router_z_loss_mlp": 0.14501953, + "step": 4852, + "time_per_iteration": 2.599057912826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062002, + "balance_loss_mlp": 1.04773307, + "epoch": 0.9336283185840708, + "flos": 539809588224.0, + "grad_norm": 0.09818611570102258, + "language_loss": 0.82896936, + "learning_rate": 1.1507983636747488e-05, + "loss": 0.83958936, + "num_input_tokens_seen": 401756864, + "router_z_loss_mlp": 0.1427002, + "step": 4853, + "time_per_iteration": 2.7612810134887695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008076, + "balance_loss_mlp": 1.00182903, + "epoch": 0.9338207002693344, + "flos": 1562824751616.0, + "grad_norm": 0.0067103682238986335, + "language_loss": 0.78455019, + "learning_rate": 1.1441622613457824e-05, + "loss": 0.79463089, + "num_input_tokens_seen": 401983664, + "router_z_loss_mlp": 0.06225586, + "step": 4854, + "time_per_iteration": 4.886642694473267 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065823, + "balance_loss_mlp": 1.05126715, + "epoch": 0.9340130819545979, + "flos": 645261811200.0, + "grad_norm": 0.05765482592606577, + "language_loss": 0.81226462, + "learning_rate": 1.1375451265351833e-05, + "loss": 0.82292283, + "num_input_tokens_seen": 402065744, + "router_z_loss_mlp": 0.14550781, + "step": 4855, + "time_per_iteration": 2.954638957977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063009, + "balance_loss_mlp": 1.0485847, + "epoch": 0.9342054636398615, + "flos": 503441588736.0, + "grad_norm": 0.06536242682566035, + "language_loss": 0.76978171, + "learning_rate": 1.1309469618119516e-05, + "loss": 0.78041184, + "num_input_tokens_seen": 402137728, + "router_z_loss_mlp": 
0.14416504, + "step": 4856, + "time_per_iteration": 2.706509828567505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068073, + "balance_loss_mlp": 1.05385113, + "epoch": 0.934397845325125, + "flos": 593026126848.0, + "grad_norm": 0.06559583357944072, + "language_loss": 0.84238493, + "learning_rate": 1.1243677697377109e-05, + "loss": 0.85306573, + "num_input_tokens_seen": 402220160, + "router_z_loss_mlp": 0.14233398, + "step": 4857, + "time_per_iteration": 2.8610572814941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068248, + "balance_loss_mlp": 1.05399108, + "epoch": 0.9345902270103886, + "flos": 499891129344.0, + "grad_norm": 0.07255060055784338, + "language_loss": 0.80180097, + "learning_rate": 1.1178075528667453e-05, + "loss": 0.81248355, + "num_input_tokens_seen": 402285168, + "router_z_loss_mlp": 0.1427002, + "step": 4858, + "time_per_iteration": 2.552783727645874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008126, + "balance_loss_mlp": 1.00187981, + "epoch": 0.9347826086956522, + "flos": 1520329347072.0, + "grad_norm": 0.006716178766718662, + "language_loss": 0.7598772, + "learning_rate": 1.1112663137459566e-05, + "loss": 0.7699585, + "num_input_tokens_seen": 402504912, + "router_z_loss_mlp": 0.06225586, + "step": 4859, + "time_per_iteration": 4.672068357467651 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068402, + "balance_loss_mlp": 1.05360794, + "epoch": 0.9349749903809157, + "flos": 504550923264.0, + "grad_norm": 0.06024273596411253, + "language_loss": 0.81101823, + "learning_rate": 1.1047440549148636e-05, + "loss": 0.8217023, + "num_input_tokens_seen": 402582032, + "router_z_loss_mlp": 0.14782715, + "step": 4860, + "time_per_iteration": 2.807316541671753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065413, + "balance_loss_mlp": 1.05103672, + "epoch": 0.9351673720661793, + "flos": 568901200896.0, + "grad_norm": 0.09837835875374011, + 
"language_loss": 0.78307474, + "learning_rate": 1.0982407789056514e-05, + "loss": 0.79372889, + "num_input_tokens_seen": 402650144, + "router_z_loss_mlp": 0.14367676, + "step": 4861, + "time_per_iteration": 2.6451520919799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066536, + "balance_loss_mlp": 1.05227828, + "epoch": 0.9353597537514429, + "flos": 544605576192.0, + "grad_norm": 0.07168847112729194, + "language_loss": 0.86299908, + "learning_rate": 1.0917564882430952e-05, + "loss": 0.8736645, + "num_input_tokens_seen": 402720368, + "router_z_loss_mlp": 0.1427002, + "step": 4862, + "time_per_iteration": 2.6640143394470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060222, + "balance_loss_mlp": 1.04584587, + "epoch": 0.9355521354367065, + "flos": 518997984768.0, + "grad_norm": 0.06539054611804387, + "language_loss": 0.8488996, + "learning_rate": 1.0852911854446368e-05, + "loss": 0.85950184, + "num_input_tokens_seen": 402795568, + "router_z_loss_mlp": 0.14379883, + "step": 4863, + "time_per_iteration": 2.8074288368225098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068234, + "balance_loss_mlp": 1.05423915, + "epoch": 0.93574451712197, + "flos": 446316314112.0, + "grad_norm": 0.07358102996376413, + "language_loss": 0.7843712, + "learning_rate": 1.0788448730203237e-05, + "loss": 0.79505354, + "num_input_tokens_seen": 402858784, + "router_z_loss_mlp": 0.13989258, + "step": 4864, + "time_per_iteration": 2.5553953647613525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066209, + "balance_loss_mlp": 1.0520947, + "epoch": 0.9359368988072335, + "flos": 480517401600.0, + "grad_norm": 0.0873526788853267, + "language_loss": 0.76845741, + "learning_rate": 1.072417553472832e-05, + "loss": 0.77911949, + "num_input_tokens_seen": 402924144, + "router_z_loss_mlp": 0.14135742, + "step": 4865, + "time_per_iteration": 2.538104295730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01064436, + "balance_loss_mlp": 1.05001128, + "epoch": 0.9361292804924971, + "flos": 497118892032.0, + "grad_norm": 0.07011272694309466, + "language_loss": 0.85173476, + "learning_rate": 1.0660092292974766e-05, + "loss": 0.86237907, + "num_input_tokens_seen": 402987488, + "router_z_loss_mlp": 0.1439209, + "step": 4866, + "time_per_iteration": 2.601087808609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062752, + "balance_loss_mlp": 1.04833984, + "epoch": 0.9363216621777607, + "flos": 618122368512.0, + "grad_norm": 0.06413466632089472, + "language_loss": 0.84145504, + "learning_rate": 1.059619902982184e-05, + "loss": 0.85208255, + "num_input_tokens_seen": 403058224, + "router_z_loss_mlp": 0.14416504, + "step": 4867, + "time_per_iteration": 2.771540403366089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008072, + "balance_loss_mlp": 1.00182533, + "epoch": 0.9365140438630243, + "flos": 1415929559040.0, + "grad_norm": 0.006714289300712873, + "language_loss": 0.79203337, + "learning_rate": 1.053249577007509e-05, + "loss": 0.80211407, + "num_input_tokens_seen": 403289072, + "router_z_loss_mlp": 0.06225586, + "step": 4868, + "time_per_iteration": 4.87546706199646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106451, + "balance_loss_mlp": 1.05019283, + "epoch": 0.9367064255482878, + "flos": 590503509504.0, + "grad_norm": 0.07280245758822038, + "language_loss": 0.81747389, + "learning_rate": 1.0468982538466287e-05, + "loss": 0.82811898, + "num_input_tokens_seen": 403361728, + "router_z_loss_mlp": 0.14294434, + "step": 4869, + "time_per_iteration": 2.725703477859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066438, + "balance_loss_mlp": 1.05178761, + "epoch": 0.9368988072335513, + "flos": 526637790720.0, + "grad_norm": 0.08059688875946759, + "language_loss": 0.81905812, + "learning_rate": 1.0405659359653597e-05, + "loss": 0.82972252, + "num_input_tokens_seen": 403431536, + 
"router_z_loss_mlp": 0.14624023, + "step": 4870, + "time_per_iteration": 2.6715352535247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065493, + "balance_loss_mlp": 1.05104446, + "epoch": 0.9370911889188149, + "flos": 743205279744.0, + "grad_norm": 0.07748687060476699, + "language_loss": 0.78810263, + "learning_rate": 1.034252625822113e-05, + "loss": 0.79875755, + "num_input_tokens_seen": 403504768, + "router_z_loss_mlp": 0.14428711, + "step": 4871, + "time_per_iteration": 2.916724443435669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067142, + "balance_loss_mlp": 1.05271745, + "epoch": 0.9372835706040785, + "flos": 546038682624.0, + "grad_norm": 0.07044828072534959, + "language_loss": 0.78682631, + "learning_rate": 1.0279583258679448e-05, + "loss": 0.79749775, + "num_input_tokens_seen": 403575584, + "router_z_loss_mlp": 0.14404297, + "step": 4872, + "time_per_iteration": 2.6422388553619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065911, + "balance_loss_mlp": 1.05154586, + "epoch": 0.9374759522893421, + "flos": 491633515008.0, + "grad_norm": 0.08201625964477643, + "language_loss": 0.81656861, + "learning_rate": 1.0216830385465003e-05, + "loss": 0.82722771, + "num_input_tokens_seen": 403648720, + "router_z_loss_mlp": 0.14343262, + "step": 4873, + "time_per_iteration": 2.7001724243164062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107035, + "balance_loss_mlp": 1.055902, + "epoch": 0.9376683339746056, + "flos": 578421222912.0, + "grad_norm": 0.07557342334959853, + "language_loss": 0.82583576, + "learning_rate": 1.0154267662940809e-05, + "loss": 0.83653927, + "num_input_tokens_seen": 403721392, + "router_z_loss_mlp": 0.14428711, + "step": 4874, + "time_per_iteration": 2.7377657890319824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065743, + "balance_loss_mlp": 1.05108047, + "epoch": 0.9378607156598692, + "flos": 506290549248.0, + "grad_norm": 
0.0827488528047596, + "language_loss": 0.8048206, + "learning_rate": 1.0091895115395766e-05, + "loss": 0.81547809, + "num_input_tokens_seen": 403792112, + "router_z_loss_mlp": 0.1463623, + "step": 4875, + "time_per_iteration": 2.713914632797241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064522, + "balance_loss_mlp": 1.04972827, + "epoch": 0.9380530973451328, + "flos": 520015915008.0, + "grad_norm": 0.09685567377493352, + "language_loss": 0.77720559, + "learning_rate": 1.0029712767045062e-05, + "loss": 0.7878508, + "num_input_tokens_seen": 403860928, + "router_z_loss_mlp": 0.14782715, + "step": 4876, + "time_per_iteration": 2.6722288131713867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062474, + "balance_loss_mlp": 1.04804993, + "epoch": 0.9382454790303963, + "flos": 557799768576.0, + "grad_norm": 0.06317554593447856, + "language_loss": 0.84759539, + "learning_rate": 9.967720642029999e-06, + "loss": 0.8582201, + "num_input_tokens_seen": 403928240, + "router_z_loss_mlp": 0.14416504, + "step": 4877, + "time_per_iteration": 2.759575128555298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064011, + "balance_loss_mlp": 1.04950309, + "epoch": 0.9384378607156598, + "flos": 695476316160.0, + "grad_norm": 0.07554619114049714, + "language_loss": 0.81792021, + "learning_rate": 9.905918764418153e-06, + "loss": 0.82856029, + "num_input_tokens_seen": 404004320, + "router_z_loss_mlp": 0.14489746, + "step": 4878, + "time_per_iteration": 2.9286370277404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065892, + "balance_loss_mlp": 1.05136013, + "epoch": 0.9386302424009234, + "flos": 554750747136.0, + "grad_norm": 0.07984933040199384, + "language_loss": 0.80986464, + "learning_rate": 9.844307158203058e-06, + "loss": 0.82052362, + "num_input_tokens_seen": 404077040, + "router_z_loss_mlp": 0.1451416, + "step": 4879, + "time_per_iteration": 2.6613898277282715 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01063468, + "balance_loss_mlp": 1.04887724, + "epoch": 0.938822624086187, + "flos": 566981337600.0, + "grad_norm": 0.08367891448674436, + "language_loss": 0.79728901, + "learning_rate": 9.782885847304469e-06, + "loss": 0.80792373, + "num_input_tokens_seen": 404145248, + "router_z_loss_mlp": 0.14587402, + "step": 4880, + "time_per_iteration": 2.7297160625457764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067114, + "balance_loss_mlp": 1.05274892, + "epoch": 0.9390150057714506, + "flos": 417602801664.0, + "grad_norm": 0.07679866362319365, + "language_loss": 0.80293953, + "learning_rate": 9.721654855568196e-06, + "loss": 0.81361073, + "num_input_tokens_seen": 404212000, + "router_z_loss_mlp": 0.14367676, + "step": 4881, + "time_per_iteration": 2.5838916301727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070302, + "balance_loss_mlp": 1.05568695, + "epoch": 0.9392073874567142, + "flos": 1553839967232.0, + "grad_norm": 0.14673478335081816, + "language_loss": 0.76342237, + "learning_rate": 9.660614206766394e-06, + "loss": 0.7741254, + "num_input_tokens_seen": 404305408, + "router_z_loss_mlp": 0.14599609, + "step": 4882, + "time_per_iteration": 3.6900463104248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068127, + "balance_loss_mlp": 1.05361927, + "epoch": 0.9393997691419776, + "flos": 652536000000.0, + "grad_norm": 0.07340756256614964, + "language_loss": 0.78028488, + "learning_rate": 9.59976392459705e-06, + "loss": 0.79096615, + "num_input_tokens_seen": 404383248, + "router_z_loss_mlp": 0.14489746, + "step": 4883, + "time_per_iteration": 2.76796817779541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100936, + "balance_loss_mlp": 1.00316095, + "epoch": 0.9395921508272412, + "flos": 1553294817792.0, + "grad_norm": 0.00609237494033278, + "language_loss": 0.78170681, + "learning_rate": 9.539104032684209e-06, + "loss": 0.79180038, + "num_input_tokens_seen": 
404615264, + "router_z_loss_mlp": 0.06201172, + "step": 4884, + "time_per_iteration": 4.855507850646973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068527, + "balance_loss_mlp": 1.05419779, + "epoch": 0.9397845325125048, + "flos": 498144162816.0, + "grad_norm": 0.06608100725405705, + "language_loss": 0.78651726, + "learning_rate": 9.478634554578314e-06, + "loss": 0.79720247, + "num_input_tokens_seen": 404684656, + "router_z_loss_mlp": 0.14318848, + "step": 4885, + "time_per_iteration": 2.647665023803711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066562, + "balance_loss_mlp": 1.05208969, + "epoch": 0.9399769141977684, + "flos": 498596414976.0, + "grad_norm": 0.07655444770073823, + "language_loss": 0.8362307, + "learning_rate": 9.418355513755638e-06, + "loss": 0.84689629, + "num_input_tokens_seen": 404752096, + "router_z_loss_mlp": 0.14465332, + "step": 4886, + "time_per_iteration": 2.6135685443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010084, + "balance_loss_mlp": 1.00388551, + "epoch": 0.9401692958830319, + "flos": 1402500427776.0, + "grad_norm": 0.007184013701095998, + "language_loss": 0.79332191, + "learning_rate": 9.358266933618575e-06, + "loss": 0.80342275, + "num_input_tokens_seen": 404980944, + "router_z_loss_mlp": 0.06201172, + "step": 4887, + "time_per_iteration": 4.774345397949219 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066371, + "balance_loss_mlp": 1.05168462, + "epoch": 0.9403616775682955, + "flos": 540123448320.0, + "grad_norm": 0.055656393961397786, + "language_loss": 0.84932435, + "learning_rate": 9.298368837495575e-06, + "loss": 0.85998809, + "num_input_tokens_seen": 405056688, + "router_z_loss_mlp": 0.14660645, + "step": 4888, + "time_per_iteration": 2.833160638809204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010135, + "balance_loss_mlp": 1.00388861, + "epoch": 0.9405540592535591, + "flos": 1322058184704.0, + 
"grad_norm": 0.007188675578583431, + "language_loss": 0.75169432, + "learning_rate": 9.238661248641089e-06, + "loss": 0.76179576, + "num_input_tokens_seen": 405284656, + "router_z_loss_mlp": 0.06225586, + "step": 4889, + "time_per_iteration": 4.915827989578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065193, + "balance_loss_mlp": 1.0505538, + "epoch": 0.9407464409388226, + "flos": 572362827264.0, + "grad_norm": 0.08497767098869012, + "language_loss": 0.82881129, + "learning_rate": 9.179144190235799e-06, + "loss": 0.83946323, + "num_input_tokens_seen": 405351584, + "router_z_loss_mlp": 0.14611816, + "step": 4890, + "time_per_iteration": 2.6498968601226807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066088, + "balance_loss_mlp": 1.05141306, + "epoch": 0.9409388226240862, + "flos": 511264203264.0, + "grad_norm": 0.06360484730349225, + "language_loss": 0.76604337, + "learning_rate": 9.119817685386112e-06, + "loss": 0.77670431, + "num_input_tokens_seen": 405425712, + "router_z_loss_mlp": 0.14648438, + "step": 4891, + "time_per_iteration": 2.7343337535858154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010139, + "balance_loss_mlp": 1.00389278, + "epoch": 0.9411312043093497, + "flos": 1569901077504.0, + "grad_norm": 0.00718633131099091, + "language_loss": 0.80241883, + "learning_rate": 9.06068175712471e-06, + "loss": 0.81252027, + "num_input_tokens_seen": 405655760, + "router_z_loss_mlp": 0.06225586, + "step": 4892, + "time_per_iteration": 4.940707206726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067624, + "balance_loss_mlp": 1.05330682, + "epoch": 0.9413235859946133, + "flos": 569469450240.0, + "grad_norm": 0.07938482085653319, + "language_loss": 0.78470445, + "learning_rate": 9.001736428410234e-06, + "loss": 0.79538065, + "num_input_tokens_seen": 405731664, + "router_z_loss_mlp": 0.14318848, + "step": 4893, + "time_per_iteration": 2.739201784133911 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068375, + "balance_loss_mlp": 1.05412984, + "epoch": 0.9415159676798769, + "flos": 781905747456.0, + "grad_norm": 0.07942221515797811, + "language_loss": 0.80200732, + "learning_rate": 8.942981722127263e-06, + "loss": 0.81269109, + "num_input_tokens_seen": 405808128, + "router_z_loss_mlp": 0.14257812, + "step": 4894, + "time_per_iteration": 3.002270460128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068942, + "balance_loss_mlp": 1.05450535, + "epoch": 0.9417083493651405, + "flos": 849341749248.0, + "grad_norm": 0.059340658547535424, + "language_loss": 0.79964054, + "learning_rate": 8.884417661086331e-06, + "loss": 0.81032991, + "num_input_tokens_seen": 405892448, + "router_z_loss_mlp": 0.14428711, + "step": 4895, + "time_per_iteration": 3.1609625816345215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065293, + "balance_loss_mlp": 1.0512265, + "epoch": 0.941900731050404, + "flos": 529333304832.0, + "grad_norm": 0.06768940884435448, + "language_loss": 0.85507524, + "learning_rate": 8.826044268024025e-06, + "loss": 0.86572814, + "num_input_tokens_seen": 405966736, + "router_z_loss_mlp": 0.14074707, + "step": 4896, + "time_per_iteration": 2.695668935775757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106449, + "balance_loss_mlp": 1.04998195, + "epoch": 0.9420931127356675, + "flos": 557073303552.0, + "grad_norm": 0.2444941012145158, + "language_loss": 0.80151224, + "learning_rate": 8.767861565602997e-06, + "loss": 0.81215715, + "num_input_tokens_seen": 406043264, + "router_z_loss_mlp": 0.14489746, + "step": 4897, + "time_per_iteration": 2.748248338699341 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064529, + "balance_loss_mlp": 1.05071259, + "epoch": 0.9422854944209311, + "flos": 652543340544.0, + "grad_norm": 0.0711327665807799, + "language_loss": 0.86498511, + "learning_rate": 8.709869576411733e-06, + "loss": 0.87563032, + 
"num_input_tokens_seen": 406119552, + "router_z_loss_mlp": 0.13830566, + "step": 4898, + "time_per_iteration": 2.826136827468872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067484, + "balance_loss_mlp": 1.053298, + "epoch": 0.9424778761061947, + "flos": 553685829120.0, + "grad_norm": 0.06562049351455196, + "language_loss": 0.83802789, + "learning_rate": 8.65206832296478e-06, + "loss": 0.84870267, + "num_input_tokens_seen": 406192464, + "router_z_loss_mlp": 0.14196777, + "step": 4899, + "time_per_iteration": 2.702162027359009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066084, + "balance_loss_mlp": 1.05165935, + "epoch": 0.9426702577914583, + "flos": 588559053312.0, + "grad_norm": 0.10016132550548382, + "language_loss": 0.79835558, + "learning_rate": 8.594457827702406e-06, + "loss": 0.80901635, + "num_input_tokens_seen": 406262640, + "router_z_loss_mlp": 0.14416504, + "step": 4900, + "time_per_iteration": 2.7979788780212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072147, + "balance_loss_mlp": 1.05753195, + "epoch": 0.9428626394767218, + "flos": 616625021952.0, + "grad_norm": 0.08155692962699897, + "language_loss": 0.78633022, + "learning_rate": 8.537038112991114e-06, + "loss": 0.79705167, + "num_input_tokens_seen": 406341328, + "router_z_loss_mlp": 0.14611816, + "step": 4901, + "time_per_iteration": 2.8697218894958496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067206, + "balance_loss_mlp": 1.05312753, + "epoch": 0.9430550211619854, + "flos": 610410981888.0, + "grad_norm": 0.07461125959373652, + "language_loss": 0.8191936, + "learning_rate": 8.479809201123178e-06, + "loss": 0.82986569, + "num_input_tokens_seen": 406418864, + "router_z_loss_mlp": 0.140625, + "step": 4902, + "time_per_iteration": 2.756660223007202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070532, + "balance_loss_mlp": 1.05659688, + "epoch": 0.943247402847249, + "flos": 
565990571520.0, + "grad_norm": 0.08855284632935614, + "language_loss": 0.78214121, + "learning_rate": 8.422771114316885e-06, + "loss": 0.79284656, + "num_input_tokens_seen": 406492320, + "router_z_loss_mlp": 0.1394043, + "step": 4903, + "time_per_iteration": 2.7230935096740723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068043, + "balance_loss_mlp": 1.05367839, + "epoch": 0.9434397845325125, + "flos": 527040483840.0, + "grad_norm": 0.07779943104118621, + "language_loss": 0.81681037, + "learning_rate": 8.365923874716297e-06, + "loss": 0.82749075, + "num_input_tokens_seen": 406560448, + "router_z_loss_mlp": 0.14343262, + "step": 4904, + "time_per_iteration": 2.6842496395111084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065765, + "balance_loss_mlp": 1.05143571, + "epoch": 0.943632166217776, + "flos": 593451214848.0, + "grad_norm": 0.0743178764901382, + "language_loss": 0.8264221, + "learning_rate": 8.309267504391593e-06, + "loss": 0.83707976, + "num_input_tokens_seen": 406631376, + "router_z_loss_mlp": 0.14318848, + "step": 4905, + "time_per_iteration": 2.7265892028808594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063706, + "balance_loss_mlp": 1.04915047, + "epoch": 0.9438245479030396, + "flos": 572770289664.0, + "grad_norm": 0.06559203485836985, + "language_loss": 0.85403311, + "learning_rate": 8.252802025338623e-06, + "loss": 0.86467016, + "num_input_tokens_seen": 406713728, + "router_z_loss_mlp": 0.14562988, + "step": 4906, + "time_per_iteration": 2.8402488231658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106202, + "balance_loss_mlp": 1.04754758, + "epoch": 0.9440169295883032, + "flos": 488258523648.0, + "grad_norm": 0.08035317225981296, + "language_loss": 0.81744617, + "learning_rate": 8.196527459479242e-06, + "loss": 0.82806635, + "num_input_tokens_seen": 406779168, + "router_z_loss_mlp": 0.14465332, + "step": 4907, + "time_per_iteration": 2.563253164291382 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061471, + "balance_loss_mlp": 1.04690337, + "epoch": 0.9442093112735668, + "flos": 731742999552.0, + "grad_norm": 0.06131941333469913, + "language_loss": 0.73863798, + "learning_rate": 8.140443828661137e-06, + "loss": 0.74925268, + "num_input_tokens_seen": 406860816, + "router_z_loss_mlp": 0.14550781, + "step": 4908, + "time_per_iteration": 3.048938751220703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066394, + "balance_loss_mlp": 1.05223215, + "epoch": 0.9444016929588304, + "flos": 571031036928.0, + "grad_norm": 0.1315206917141544, + "language_loss": 0.82031131, + "learning_rate": 8.084551154658004e-06, + "loss": 0.83097517, + "num_input_tokens_seen": 406929888, + "router_z_loss_mlp": 0.14172363, + "step": 4909, + "time_per_iteration": 2.698793411254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066845, + "balance_loss_mlp": 1.05251575, + "epoch": 0.9445940746440938, + "flos": 509292582912.0, + "grad_norm": 0.06866665177014267, + "language_loss": 0.85794264, + "learning_rate": 8.028849459169318e-06, + "loss": 0.86861104, + "num_input_tokens_seen": 406998224, + "router_z_loss_mlp": 0.14318848, + "step": 4910, + "time_per_iteration": 2.5939254760742188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069973, + "balance_loss_mlp": 1.05559599, + "epoch": 0.9447864563293574, + "flos": 624556293120.0, + "grad_norm": 0.07303339072359728, + "language_loss": 0.80941725, + "learning_rate": 7.97333876382028e-06, + "loss": 0.820117, + "num_input_tokens_seen": 407075088, + "router_z_loss_mlp": 0.14355469, + "step": 4911, + "time_per_iteration": 2.874375820159912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066266, + "balance_loss_mlp": 1.05161524, + "epoch": 0.944978838014621, + "flos": 505270047744.0, + "grad_norm": 0.06964507241753962, + "language_loss": 0.80899501, + "learning_rate": 7.918019090162098e-06, + "loss": 0.81965774, + 
"num_input_tokens_seen": 407147792, + "router_z_loss_mlp": 0.14648438, + "step": 4912, + "time_per_iteration": 2.760795831680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008957, + "balance_loss_mlp": 1.0027107, + "epoch": 0.9451712196998846, + "flos": 1484205451776.0, + "grad_norm": 0.006122085341861952, + "language_loss": 0.78287339, + "learning_rate": 7.862890459671812e-06, + "loss": 0.79296297, + "num_input_tokens_seen": 407387216, + "router_z_loss_mlp": 0.06225586, + "step": 4913, + "time_per_iteration": 5.09798526763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067638, + "balance_loss_mlp": 1.05330908, + "epoch": 0.9453636013851482, + "flos": 521137732608.0, + "grad_norm": 0.07285167198538633, + "language_loss": 0.90140414, + "learning_rate": 7.80795289375219e-06, + "loss": 0.91208053, + "num_input_tokens_seen": 407457664, + "router_z_loss_mlp": 0.14343262, + "step": 4914, + "time_per_iteration": 2.6756272315979004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008919, + "balance_loss_mlp": 1.00267243, + "epoch": 0.9455559830704117, + "flos": 1496902975488.0, + "grad_norm": 0.006122840187546539, + "language_loss": 0.8356235, + "learning_rate": 7.75320641373195e-06, + "loss": 0.84571272, + "num_input_tokens_seen": 407700256, + "router_z_loss_mlp": 0.06225586, + "step": 4915, + "time_per_iteration": 4.950132846832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067573, + "balance_loss_mlp": 1.05333984, + "epoch": 0.9457483647556753, + "flos": 498126910464.0, + "grad_norm": 0.07125264775294483, + "language_loss": 0.81883103, + "learning_rate": 7.698651040865534e-06, + "loss": 0.82950681, + "num_input_tokens_seen": 407770080, + "router_z_loss_mlp": 0.14233398, + "step": 4916, + "time_per_iteration": 2.6505517959594727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060955, + "balance_loss_mlp": 1.04641187, + "epoch": 0.9459407464409388, + "flos": 
1019405979648.0, + "grad_norm": 0.06098908339015476, + "language_loss": 0.8214764, + "learning_rate": 7.644286796333222e-06, + "loss": 0.83208597, + "num_input_tokens_seen": 407854640, + "router_z_loss_mlp": 0.14526367, + "step": 4917, + "time_per_iteration": 3.3748598098754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068013, + "balance_loss_mlp": 1.05392301, + "epoch": 0.9461331281262024, + "flos": 513589330944.0, + "grad_norm": 0.11117653680643763, + "language_loss": 0.81199044, + "learning_rate": 7.590113701241075e-06, + "loss": 0.82267058, + "num_input_tokens_seen": 407922704, + "router_z_loss_mlp": 0.14099121, + "step": 4918, + "time_per_iteration": 2.619296073913574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066258, + "balance_loss_mlp": 1.05184555, + "epoch": 0.9463255098114659, + "flos": 528023909376.0, + "grad_norm": 0.09091182296398484, + "language_loss": 0.78226775, + "learning_rate": 7.536131776620936e-06, + "loss": 0.79293031, + "num_input_tokens_seen": 407991136, + "router_z_loss_mlp": 0.1439209, + "step": 4919, + "time_per_iteration": 2.5946567058563232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066597, + "balance_loss_mlp": 1.05230427, + "epoch": 0.9465178914967295, + "flos": 506043500544.0, + "grad_norm": 0.09912025388062279, + "language_loss": 0.83234036, + "learning_rate": 7.482341043430485e-06, + "loss": 0.84300637, + "num_input_tokens_seen": 408056576, + "router_z_loss_mlp": 0.14294434, + "step": 4920, + "time_per_iteration": 2.630028486251831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060019, + "balance_loss_mlp": 1.0456301, + "epoch": 0.9467102731819931, + "flos": 660254727168.0, + "grad_norm": 0.07221122100857683, + "language_loss": 0.8528769, + "learning_rate": 7.428741522553184e-06, + "loss": 0.86347711, + "num_input_tokens_seen": 408136960, + "router_z_loss_mlp": 0.14379883, + "step": 4921, + "time_per_iteration": 2.894165277481079 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063149, + "balance_loss_mlp": 1.0486412, + "epoch": 0.9469026548672567, + "flos": 675183403008.0, + "grad_norm": 0.06500736705872397, + "language_loss": 0.89518285, + "learning_rate": 7.375333234798054e-06, + "loss": 0.90581435, + "num_input_tokens_seen": 408218304, + "router_z_loss_mlp": 0.14489746, + "step": 4922, + "time_per_iteration": 2.938701629638672 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061564, + "balance_loss_mlp": 1.04743826, + "epoch": 0.9470950365525203, + "flos": 513964859904.0, + "grad_norm": 0.07426983917980619, + "language_loss": 0.79634815, + "learning_rate": 7.32211620090012e-06, + "loss": 0.80696386, + "num_input_tokens_seen": 408287936, + "router_z_loss_mlp": 0.14135742, + "step": 4923, + "time_per_iteration": 2.650129556655884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066205, + "balance_loss_mlp": 1.05188811, + "epoch": 0.9472874182377837, + "flos": 550103063040.0, + "grad_norm": 0.06509608492345216, + "language_loss": 0.81173092, + "learning_rate": 7.269090441520132e-06, + "loss": 0.822393, + "num_input_tokens_seen": 408365568, + "router_z_loss_mlp": 0.14318848, + "step": 4924, + "time_per_iteration": 2.8149211406707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066395, + "balance_loss_mlp": 1.05208969, + "epoch": 0.9474797999230473, + "flos": 542769776640.0, + "grad_norm": 0.06782513775303885, + "language_loss": 0.80087507, + "learning_rate": 7.216255977244457e-06, + "loss": 0.81153905, + "num_input_tokens_seen": 408431248, + "router_z_loss_mlp": 0.14294434, + "step": 4925, + "time_per_iteration": 2.7232677936553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066702, + "balance_loss_mlp": 1.05213428, + "epoch": 0.9476721816083109, + "flos": 844644879360.0, + "grad_norm": 0.07485474272112004, + "language_loss": 0.85697073, + "learning_rate": 7.163612828585242e-06, + "loss": 0.86763775, + 
"num_input_tokens_seen": 408514112, + "router_z_loss_mlp": 0.14562988, + "step": 4926, + "time_per_iteration": 3.1124749183654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106154, + "balance_loss_mlp": 1.0473187, + "epoch": 0.9478645632935745, + "flos": 638002676736.0, + "grad_norm": 0.07642573130807323, + "language_loss": 0.79089957, + "learning_rate": 7.1111610159803605e-06, + "loss": 0.80151492, + "num_input_tokens_seen": 408585968, + "router_z_loss_mlp": 0.14233398, + "step": 4927, + "time_per_iteration": 2.7843739986419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_mlp": 1.05232859, + "epoch": 0.948056944978838, + "flos": 656832748032.0, + "grad_norm": 0.06890136753931456, + "language_loss": 0.75879681, + "learning_rate": 7.058900559793469e-06, + "loss": 0.7694627, + "num_input_tokens_seen": 408665456, + "router_z_loss_mlp": 0.1427002, + "step": 4928, + "time_per_iteration": 2.831721544265747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064108, + "balance_loss_mlp": 1.04976702, + "epoch": 0.9482493266641016, + "flos": 440907660288.0, + "grad_norm": 0.07279210234186714, + "language_loss": 0.83387977, + "learning_rate": 7.00683148031378e-06, + "loss": 0.84452081, + "num_input_tokens_seen": 408730192, + "router_z_loss_mlp": 0.14318848, + "step": 4929, + "time_per_iteration": 2.5189461708068848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065665, + "balance_loss_mlp": 1.05141914, + "epoch": 0.9484417083493651, + "flos": 545989123584.0, + "grad_norm": 0.10355473964413647, + "language_loss": 0.78032148, + "learning_rate": 6.9549537977564024e-06, + "loss": 0.79097813, + "num_input_tokens_seen": 408807616, + "router_z_loss_mlp": 0.14245605, + "step": 4930, + "time_per_iteration": 2.784816265106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064868, + "balance_loss_mlp": 1.05073011, + "epoch": 0.9486340900346287, + "flos": 
538598937600.0, + "grad_norm": 0.05876135972257936, + "language_loss": 0.79680765, + "learning_rate": 6.903267532262003e-06, + "loss": 0.80745637, + "num_input_tokens_seen": 408883552, + "router_z_loss_mlp": 0.14123535, + "step": 4931, + "time_per_iteration": 2.7050349712371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064154, + "balance_loss_mlp": 1.04932451, + "epoch": 0.9488264717198923, + "flos": 681669457920.0, + "grad_norm": 0.07900168224776632, + "language_loss": 0.8563565, + "learning_rate": 6.851772703896975e-06, + "loss": 0.86699808, + "num_input_tokens_seen": 408956400, + "router_z_loss_mlp": 0.14831543, + "step": 4932, + "time_per_iteration": 2.827993631362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064277, + "balance_loss_mlp": 1.04999626, + "epoch": 0.9490188534051558, + "flos": 462603944448.0, + "grad_norm": 0.08240763026965599, + "language_loss": 0.87754375, + "learning_rate": 6.8004693326533805e-06, + "loss": 0.88818657, + "num_input_tokens_seen": 409019904, + "router_z_loss_mlp": 0.14257812, + "step": 4933, + "time_per_iteration": 2.552738666534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064417, + "balance_loss_mlp": 1.05015934, + "epoch": 0.9492112350904194, + "flos": 543135393792.0, + "grad_norm": 0.06884328604621799, + "language_loss": 0.8272537, + "learning_rate": 6.7493574384489e-06, + "loss": 0.8378979, + "num_input_tokens_seen": 409094288, + "router_z_loss_mlp": 0.1427002, + "step": 4934, + "time_per_iteration": 2.688225269317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105915, + "balance_loss_mlp": 1.04498768, + "epoch": 0.949403616775683, + "flos": 550322947584.0, + "grad_norm": 0.14362869726847521, + "language_loss": 0.84435534, + "learning_rate": 6.698437041126992e-06, + "loss": 0.85494685, + "num_input_tokens_seen": 409169120, + "router_z_loss_mlp": 0.14172363, + "step": 4935, + "time_per_iteration": 2.7325098514556885 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106501, + "balance_loss_mlp": 1.05078828, + "epoch": 0.9495959984609466, + "flos": 598383023616.0, + "grad_norm": 0.08023842745179113, + "language_loss": 0.82742482, + "learning_rate": 6.647708160456678e-06, + "loss": 0.83807492, + "num_input_tokens_seen": 409243200, + "router_z_loss_mlp": 0.14208984, + "step": 4936, + "time_per_iteration": 2.731147289276123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063563, + "balance_loss_mlp": 1.04942477, + "epoch": 0.94978838014621, + "flos": 608409626112.0, + "grad_norm": 0.07231560541851297, + "language_loss": 0.81890976, + "learning_rate": 6.597170816132702e-06, + "loss": 0.82954538, + "num_input_tokens_seen": 409319264, + "router_z_loss_mlp": 0.14135742, + "step": 4937, + "time_per_iteration": 2.8009979724884033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068576, + "balance_loss_mlp": 1.05419946, + "epoch": 0.9499807618314736, + "flos": 540832660992.0, + "grad_norm": 0.06879657649431303, + "language_loss": 0.86649179, + "learning_rate": 6.546825027775427e-06, + "loss": 0.8771776, + "num_input_tokens_seen": 409389840, + "router_z_loss_mlp": 0.14379883, + "step": 4938, + "time_per_iteration": 2.6949267387390137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065018, + "balance_loss_mlp": 1.0509038, + "epoch": 0.9501731435167372, + "flos": 594600196608.0, + "grad_norm": 0.066496386986475, + "language_loss": 0.8279618, + "learning_rate": 6.496670814930717e-06, + "loss": 0.83861196, + "num_input_tokens_seen": 409458752, + "router_z_loss_mlp": 0.14123535, + "step": 4939, + "time_per_iteration": 2.7675018310546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065861, + "balance_loss_mlp": 1.05160344, + "epoch": 0.9503655252020008, + "flos": 454138928640.0, + "grad_norm": 0.07552307901260344, + "language_loss": 0.79926252, + "learning_rate": 6.446708197070161e-06, + "loss": 0.80992115, + 
"num_input_tokens_seen": 409525008, + "router_z_loss_mlp": 0.14245605, + "step": 4940, + "time_per_iteration": 2.569943904876709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106462, + "balance_loss_mlp": 1.05030322, + "epoch": 0.9505579068872644, + "flos": 667944092160.0, + "grad_norm": 0.07875850796972751, + "language_loss": 0.84661138, + "learning_rate": 6.396937193591079e-06, + "loss": 0.8572576, + "num_input_tokens_seen": 409603376, + "router_z_loss_mlp": 0.14294434, + "step": 4941, + "time_per_iteration": 2.777996301651001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_mlp": 1.0555886, + "epoch": 0.9507502885725279, + "flos": 402207192576.0, + "grad_norm": 0.10996264625691853, + "language_loss": 0.81954122, + "learning_rate": 6.347357823816235e-06, + "loss": 0.8302412, + "num_input_tokens_seen": 409667168, + "router_z_loss_mlp": 0.14416504, + "step": 4942, + "time_per_iteration": 2.4901835918426514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064524, + "balance_loss_mlp": 1.04971838, + "epoch": 0.9509426702577914, + "flos": 700358565888.0, + "grad_norm": 0.15087682626916998, + "language_loss": 0.79449248, + "learning_rate": 6.297970106994011e-06, + "loss": 0.80513769, + "num_input_tokens_seen": 409746832, + "router_z_loss_mlp": 0.14782715, + "step": 4943, + "time_per_iteration": 2.989339828491211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061962, + "balance_loss_mlp": 1.04763293, + "epoch": 0.951135051943055, + "flos": 501415640064.0, + "grad_norm": 0.06965216151492816, + "language_loss": 0.82372928, + "learning_rate": 6.2487740622985126e-06, + "loss": 0.83434892, + "num_input_tokens_seen": 409813792, + "router_z_loss_mlp": 0.14318848, + "step": 4944, + "time_per_iteration": 2.577893018722534 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062657, + "balance_loss_mlp": 1.0486021, + "epoch": 0.9513274336283186, + "flos": 
614621094912.0, + "grad_norm": 0.08955037755265657, + "language_loss": 0.81476396, + "learning_rate": 6.1997697088292395e-06, + "loss": 0.82539052, + "num_input_tokens_seen": 409898848, + "router_z_loss_mlp": 0.140625, + "step": 4945, + "time_per_iteration": 2.924196720123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062106, + "balance_loss_mlp": 1.04747915, + "epoch": 0.9515198153135821, + "flos": 519586057728.0, + "grad_norm": 0.10020891830615615, + "language_loss": 0.81823802, + "learning_rate": 6.150957065611363e-06, + "loss": 0.82885909, + "num_input_tokens_seen": 409966368, + "router_z_loss_mlp": 0.14599609, + "step": 4946, + "time_per_iteration": 2.582261800765991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063398, + "balance_loss_mlp": 1.04886687, + "epoch": 0.9517121969988457, + "flos": 664954168320.0, + "grad_norm": 0.06854803773952908, + "language_loss": 0.76341254, + "learning_rate": 6.102336151595667e-06, + "loss": 0.77404654, + "num_input_tokens_seen": 410048496, + "router_z_loss_mlp": 0.1451416, + "step": 4947, + "time_per_iteration": 2.958282947540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063649, + "balance_loss_mlp": 1.04943955, + "epoch": 0.9519045786841093, + "flos": 676409107968.0, + "grad_norm": 0.0768160436217087, + "language_loss": 0.7590248, + "learning_rate": 6.053906985658553e-06, + "loss": 0.76966131, + "num_input_tokens_seen": 410121840, + "router_z_loss_mlp": 0.14208984, + "step": 4948, + "time_per_iteration": 2.82889986038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065536, + "balance_loss_mlp": 1.05157638, + "epoch": 0.9520969603693729, + "flos": 652901617152.0, + "grad_norm": 0.08458305561651724, + "language_loss": 0.8030057, + "learning_rate": 6.005669586601814e-06, + "loss": 0.81366104, + "num_input_tokens_seen": 410199152, + "router_z_loss_mlp": 0.13977051, + "step": 4949, + "time_per_iteration": 2.864802122116089 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066868, + "balance_loss_mlp": 1.05283666, + "epoch": 0.9522893420546364, + "flos": 743284200960.0, + "grad_norm": 0.06703168985120538, + "language_loss": 0.83432692, + "learning_rate": 5.957623973152748e-06, + "loss": 0.84499562, + "num_input_tokens_seen": 410285392, + "router_z_loss_mlp": 0.14050293, + "step": 4950, + "time_per_iteration": 3.04030179977417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062922, + "balance_loss_mlp": 1.04827094, + "epoch": 0.9524817237398999, + "flos": 761696898048.0, + "grad_norm": 0.07636925944960744, + "language_loss": 0.80493855, + "learning_rate": 5.909770163964545e-06, + "loss": 0.81556773, + "num_input_tokens_seen": 410359872, + "router_z_loss_mlp": 0.14624023, + "step": 4951, + "time_per_iteration": 3.015068292617798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062355, + "balance_loss_mlp": 1.04806209, + "epoch": 0.9526741054251635, + "flos": 529125903360.0, + "grad_norm": 0.09924230241420891, + "language_loss": 0.82117671, + "learning_rate": 5.8621081776155105e-06, + "loss": 0.83180022, + "num_input_tokens_seen": 410425728, + "router_z_loss_mlp": 0.14294434, + "step": 4952, + "time_per_iteration": 2.6157262325286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065916, + "balance_loss_mlp": 1.0515151, + "epoch": 0.9528664871104271, + "flos": 488441332224.0, + "grad_norm": 0.0759427350712735, + "language_loss": 0.80944276, + "learning_rate": 5.814638032609787e-06, + "loss": 0.82010198, + "num_input_tokens_seen": 410496080, + "router_z_loss_mlp": 0.1439209, + "step": 4953, + "time_per_iteration": 2.6230878829956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065708, + "balance_loss_mlp": 1.05156994, + "epoch": 0.9530588687956907, + "flos": 517745115648.0, + "grad_norm": 0.06500419189537737, + "language_loss": 0.85041642, + "learning_rate": 5.76735974737691e-06, + "loss": 0.86107355, + 
"num_input_tokens_seen": 410576448, + "router_z_loss_mlp": 0.14160156, + "step": 4954, + "time_per_iteration": 2.739537000656128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063265, + "balance_loss_mlp": 1.04874492, + "epoch": 0.9532512504809542, + "flos": 675148898304.0, + "grad_norm": 0.07799222064442642, + "language_loss": 0.8108077, + "learning_rate": 5.720273340271864e-06, + "loss": 0.82144034, + "num_input_tokens_seen": 410655792, + "router_z_loss_mlp": 0.14526367, + "step": 4955, + "time_per_iteration": 2.9021482467651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063887, + "balance_loss_mlp": 1.04912901, + "epoch": 0.9534436321662177, + "flos": 489523502592.0, + "grad_norm": 0.1050573421070645, + "language_loss": 0.84418821, + "learning_rate": 5.673378829575249e-06, + "loss": 0.85482705, + "num_input_tokens_seen": 410725440, + "router_z_loss_mlp": 0.14733887, + "step": 4956, + "time_per_iteration": 2.639496326446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064971, + "balance_loss_mlp": 1.05083311, + "epoch": 0.9536360138514813, + "flos": 496585147392.0, + "grad_norm": 0.07729665120468585, + "language_loss": 0.82151657, + "learning_rate": 5.626676233493167e-06, + "loss": 0.83216631, + "num_input_tokens_seen": 410797552, + "router_z_loss_mlp": 0.14135742, + "step": 4957, + "time_per_iteration": 2.636658191680908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062276, + "balance_loss_mlp": 1.04801905, + "epoch": 0.9538283955367449, + "flos": 801462283776.0, + "grad_norm": 0.07611939127300738, + "language_loss": 0.84039545, + "learning_rate": 5.580165570157114e-06, + "loss": 0.85101831, + "num_input_tokens_seen": 410876736, + "router_z_loss_mlp": 0.14257812, + "step": 4958, + "time_per_iteration": 3.0440261363983154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070141, + "balance_loss_mlp": 1.05581212, + "epoch": 0.9540207772220085, + "flos": 
556668039168.0, + "grad_norm": 0.06159911525192777, + "language_loss": 0.79893637, + "learning_rate": 5.533846857624203e-06, + "loss": 0.80963778, + "num_input_tokens_seen": 410955632, + "router_z_loss_mlp": 0.14318848, + "step": 4959, + "time_per_iteration": 2.845899820327759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066785, + "balance_loss_mlp": 1.05197954, + "epoch": 0.954213158907272, + "flos": 684505935360.0, + "grad_norm": 0.06494855135175924, + "language_loss": 0.81421417, + "learning_rate": 5.487720113876882e-06, + "loss": 0.82488203, + "num_input_tokens_seen": 411038480, + "router_z_loss_mlp": 0.14782715, + "step": 4960, + "time_per_iteration": 2.9362258911132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065975, + "balance_loss_mlp": 1.05114579, + "epoch": 0.9544055405925356, + "flos": 535752548352.0, + "grad_norm": 0.07321222855176411, + "language_loss": 0.82439888, + "learning_rate": 5.441785356823214e-06, + "loss": 0.83505863, + "num_input_tokens_seen": 411109744, + "router_z_loss_mlp": 0.14819336, + "step": 4961, + "time_per_iteration": 2.7255775928497314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065276, + "balance_loss_mlp": 1.05088735, + "epoch": 0.9545979222777992, + "flos": 825404401152.0, + "grad_norm": 0.06731908344791544, + "language_loss": 0.80610138, + "learning_rate": 5.3960426042965476e-06, + "loss": 0.81675416, + "num_input_tokens_seen": 411202192, + "router_z_loss_mlp": 0.14379883, + "step": 4962, + "time_per_iteration": 3.1223695278167725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066006, + "balance_loss_mlp": 1.05186808, + "epoch": 0.9547903039630627, + "flos": 761691755520.0, + "grad_norm": 0.09400179333507447, + "language_loss": 0.77407354, + "learning_rate": 5.3504918740558405e-06, + "loss": 0.78473365, + "num_input_tokens_seen": 411289248, + "router_z_loss_mlp": 0.14147949, + "step": 4963, + "time_per_iteration": 3.0968358516693115 + }, 
+ { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068214, + "balance_loss_mlp": 1.05384946, + "epoch": 0.9549826856483262, + "flos": 515306562048.0, + "grad_norm": 0.0716769170625749, + "language_loss": 0.82514, + "learning_rate": 5.3051331837855045e-06, + "loss": 0.83582222, + "num_input_tokens_seen": 411355232, + "router_z_loss_mlp": 0.14379883, + "step": 4964, + "time_per_iteration": 2.6738553047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106483, + "balance_loss_mlp": 1.0503341, + "epoch": 0.9551750673335898, + "flos": 643107382272.0, + "grad_norm": 0.07735162593831964, + "language_loss": 0.82933629, + "learning_rate": 5.259966551095341e-06, + "loss": 0.83998454, + "num_input_tokens_seen": 411432288, + "router_z_loss_mlp": 0.14477539, + "step": 4965, + "time_per_iteration": 2.8790547847747803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060282, + "balance_loss_mlp": 1.04591751, + "epoch": 0.9553674490188534, + "flos": 472208030208.0, + "grad_norm": 0.07636041436284387, + "language_loss": 0.82715493, + "learning_rate": 5.214991993520546e-06, + "loss": 0.83775771, + "num_input_tokens_seen": 411499376, + "router_z_loss_mlp": 0.14367676, + "step": 4966, + "time_per_iteration": 2.5930259227752686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066923, + "balance_loss_mlp": 1.05236745, + "epoch": 0.955559830704117, + "flos": 528317945856.0, + "grad_norm": 0.08146247068647224, + "language_loss": 0.81637287, + "learning_rate": 5.170209528521763e-06, + "loss": 0.8270421, + "num_input_tokens_seen": 411564976, + "router_z_loss_mlp": 0.14526367, + "step": 4967, + "time_per_iteration": 2.599799633026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062549, + "balance_loss_mlp": 1.04821956, + "epoch": 0.9557522123893806, + "flos": 548168518656.0, + "grad_norm": 0.13345718857384153, + "language_loss": 0.84123564, + "learning_rate": 5.125619173485196e-06, + "loss": 0.85186112, 
+ "num_input_tokens_seen": 411636464, + "router_z_loss_mlp": 0.14318848, + "step": 4968, + "time_per_iteration": 2.6432812213897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062693, + "balance_loss_mlp": 1.0483048, + "epoch": 0.955944594074644, + "flos": 509465479680.0, + "grad_norm": 0.05812411628153182, + "language_loss": 0.81737351, + "learning_rate": 5.08122094572222e-06, + "loss": 0.82800043, + "num_input_tokens_seen": 411710672, + "router_z_loss_mlp": 0.14379883, + "step": 4969, + "time_per_iteration": 2.738231897354126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064518, + "balance_loss_mlp": 1.05016541, + "epoch": 0.9561369757599076, + "flos": 527578997760.0, + "grad_norm": 0.11097368256932602, + "language_loss": 0.79432231, + "learning_rate": 5.037014862469824e-06, + "loss": 0.80496752, + "num_input_tokens_seen": 411785616, + "router_z_loss_mlp": 0.14355469, + "step": 4970, + "time_per_iteration": 2.7720186710357666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067002, + "balance_loss_mlp": 1.05264878, + "epoch": 0.9563293574451712, + "flos": 498201062400.0, + "grad_norm": 0.17337384612850415, + "language_loss": 0.80255437, + "learning_rate": 4.993000940890391e-06, + "loss": 0.81322438, + "num_input_tokens_seen": 411854832, + "router_z_loss_mlp": 0.14367676, + "step": 4971, + "time_per_iteration": 2.656113862991333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008577, + "balance_loss_mlp": 1.00233078, + "epoch": 0.9565217391304348, + "flos": 1408875628032.0, + "grad_norm": 0.004243585536307748, + "language_loss": 0.81773561, + "learning_rate": 4.949179198071585e-06, + "loss": 0.82782137, + "num_input_tokens_seen": 412081856, + "router_z_loss_mlp": 0.0625, + "step": 4972, + "time_per_iteration": 4.874886512756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060649, + "balance_loss_mlp": 1.04655886, + "epoch": 0.9567141208156984, + "flos": 
503846853120.0, + "grad_norm": 0.06088753235658507, + "language_loss": 0.78254598, + "learning_rate": 4.905549651026464e-06, + "loss": 0.79315251, + "num_input_tokens_seen": 412155600, + "router_z_loss_mlp": 0.14086914, + "step": 4973, + "time_per_iteration": 2.7379391193389893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064278, + "balance_loss_mlp": 1.04990137, + "epoch": 0.9569065025009619, + "flos": 433213526016.0, + "grad_norm": 0.07892228707042209, + "language_loss": 0.79897404, + "learning_rate": 4.86211231669359e-06, + "loss": 0.8096168, + "num_input_tokens_seen": 412219584, + "router_z_loss_mlp": 0.14367676, + "step": 4974, + "time_per_iteration": 2.4719619750976562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066514, + "balance_loss_mlp": 1.05243528, + "epoch": 0.9570988841862255, + "flos": 589959853056.0, + "grad_norm": 0.07550242888066953, + "language_loss": 0.78395075, + "learning_rate": 4.818867211936806e-06, + "loss": 0.79461586, + "num_input_tokens_seen": 412295088, + "router_z_loss_mlp": 0.14086914, + "step": 4975, + "time_per_iteration": 2.7846784591674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106779, + "balance_loss_mlp": 1.05294812, + "epoch": 0.957291265871489, + "flos": 767278448640.0, + "grad_norm": 0.08411213981509691, + "language_loss": 0.78761947, + "learning_rate": 4.7758143535454045e-06, + "loss": 0.79829735, + "num_input_tokens_seen": 412376992, + "router_z_loss_mlp": 0.14831543, + "step": 4976, + "time_per_iteration": 2.9675724506378174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065509, + "balance_loss_mlp": 1.05140674, + "epoch": 0.9574836475567526, + "flos": 639104670720.0, + "grad_norm": 0.08804452126227069, + "language_loss": 0.84846663, + "learning_rate": 4.732953758233849e-06, + "loss": 0.8591218, + "num_input_tokens_seen": 412450064, + "router_z_loss_mlp": 0.14111328, + "step": 4977, + "time_per_iteration": 2.875070810317993 + }, + { 
+ "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0100856, + "balance_loss_mlp": 1.00231338, + "epoch": 0.9576760292420161, + "flos": 1575939649536.0, + "grad_norm": 0.004243788553748721, + "language_loss": 0.78607261, + "learning_rate": 4.690285442642272e-06, + "loss": 0.79615819, + "num_input_tokens_seen": 412676896, + "router_z_loss_mlp": 0.0625, + "step": 4978, + "time_per_iteration": 4.965006113052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061667, + "balance_loss_mlp": 1.04689729, + "epoch": 0.9578684109272797, + "flos": 496345439232.0, + "grad_norm": 0.06902169609028791, + "language_loss": 0.86979818, + "learning_rate": 4.6478094233358695e-06, + "loss": 0.88041484, + "num_input_tokens_seen": 412746848, + "router_z_loss_mlp": 0.14746094, + "step": 4979, + "time_per_iteration": 2.626277446746826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106511, + "balance_loss_mlp": 1.05060196, + "epoch": 0.9580607926125433, + "flos": 429954531840.0, + "grad_norm": 0.08534432456109216, + "language_loss": 0.85267627, + "learning_rate": 4.605525716805337e-06, + "loss": 0.86332732, + "num_input_tokens_seen": 412810144, + "router_z_loss_mlp": 0.14489746, + "step": 4980, + "time_per_iteration": 2.471359968185425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063533, + "balance_loss_mlp": 1.04926348, + "epoch": 0.9582531742978069, + "flos": 1127262251520.0, + "grad_norm": 0.07445475831229749, + "language_loss": 0.80369455, + "learning_rate": 4.563434339466599e-06, + "loss": 0.81432986, + "num_input_tokens_seen": 412904768, + "router_z_loss_mlp": 0.1427002, + "step": 4981, + "time_per_iteration": 3.5420055389404297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063708, + "balance_loss_mlp": 1.04910517, + "epoch": 0.9584455559830705, + "flos": 524458395648.0, + "grad_norm": 0.06322023271985078, + "language_loss": 0.7885139, + "learning_rate": 4.521535307661085e-06, + "loss": 0.799151, + 
"num_input_tokens_seen": 412974592, + "router_z_loss_mlp": 0.14575195, + "step": 4982, + "time_per_iteration": 2.6688125133514404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067117, + "balance_loss_mlp": 1.05241847, + "epoch": 0.9586379376683339, + "flos": 634187543040.0, + "grad_norm": 0.0657595850835073, + "language_loss": 0.8091737, + "learning_rate": 4.479828637655392e-06, + "loss": 0.81984484, + "num_input_tokens_seen": 413052848, + "router_z_loss_mlp": 0.14672852, + "step": 4983, + "time_per_iteration": 2.900023937225342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061997, + "balance_loss_mlp": 1.04760838, + "epoch": 0.9588303193535975, + "flos": 416061038592.0, + "grad_norm": 0.07308640529498234, + "language_loss": 0.8356294, + "learning_rate": 4.438314345641459e-06, + "loss": 0.84624934, + "num_input_tokens_seen": 413118000, + "router_z_loss_mlp": 0.14379883, + "step": 4984, + "time_per_iteration": 2.4941763877868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064414, + "balance_loss_mlp": 1.0499779, + "epoch": 0.9590227010388611, + "flos": 481683635712.0, + "grad_norm": 0.07297959005418315, + "language_loss": 0.78085732, + "learning_rate": 4.3969924477365585e-06, + "loss": 0.79150152, + "num_input_tokens_seen": 413185616, + "router_z_loss_mlp": 0.14416504, + "step": 4985, + "time_per_iteration": 2.574579954147339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063094, + "balance_loss_mlp": 1.0491817, + "epoch": 0.9592150827241247, + "flos": 684540440064.0, + "grad_norm": 0.0801232178302707, + "language_loss": 0.80204809, + "learning_rate": 4.355862959983359e-06, + "loss": 0.81267899, + "num_input_tokens_seen": 413265616, + "router_z_loss_mlp": 0.13916016, + "step": 4986, + "time_per_iteration": 2.9517881870269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060667, + "balance_loss_mlp": 1.0466485, + "epoch": 0.9594074644093882, + "flos": 
574490092032.0, + "grad_norm": 0.06970674893323296, + "language_loss": 0.70871252, + "learning_rate": 4.314925898349642e-06, + "loss": 0.71931922, + "num_input_tokens_seen": 413341248, + "router_z_loss_mlp": 0.14025879, + "step": 4987, + "time_per_iteration": 2.7779877185821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062505, + "balance_loss_mlp": 1.04840255, + "epoch": 0.9595998460946518, + "flos": 546871233024.0, + "grad_norm": 0.0813412690105397, + "language_loss": 0.78303689, + "learning_rate": 4.2741812787286395e-06, + "loss": 0.79366195, + "num_input_tokens_seen": 413416080, + "router_z_loss_mlp": 0.14111328, + "step": 4988, + "time_per_iteration": 2.821676015853882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061716, + "balance_loss_mlp": 1.04733956, + "epoch": 0.9597922277799154, + "flos": 474043829760.0, + "grad_norm": 0.07674089772836457, + "language_loss": 0.78562862, + "learning_rate": 4.233629116938809e-06, + "loss": 0.79624575, + "num_input_tokens_seen": 413482336, + "router_z_loss_mlp": 0.14367676, + "step": 4989, + "time_per_iteration": 2.5946123600006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063896, + "balance_loss_mlp": 1.04971027, + "epoch": 0.9599846094651789, + "flos": 514691324928.0, + "grad_norm": 0.07443176706054339, + "language_loss": 0.8581894, + "learning_rate": 4.193269428723889e-06, + "loss": 0.86882842, + "num_input_tokens_seen": 413553248, + "router_z_loss_mlp": 0.14196777, + "step": 4990, + "time_per_iteration": 2.659696578979492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064136, + "balance_loss_mlp": 1.04966402, + "epoch": 0.9601769911504425, + "flos": 594983066112.0, + "grad_norm": 0.08548186890717813, + "language_loss": 0.78247094, + "learning_rate": 4.1531022297529035e-06, + "loss": 0.79311228, + "num_input_tokens_seen": 413625776, + "router_z_loss_mlp": 0.14477539, + "step": 4991, + "time_per_iteration": 2.785454034805298 + }, + 
{ + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106867, + "balance_loss_mlp": 1.05452025, + "epoch": 0.960369372835706, + "flos": 493012293120.0, + "grad_norm": 0.05933531431309235, + "language_loss": 0.79160237, + "learning_rate": 4.1131275356201536e-06, + "loss": 0.80228913, + "num_input_tokens_seen": 413693056, + "router_z_loss_mlp": 0.14160156, + "step": 4992, + "time_per_iteration": 2.6056604385375977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106865, + "balance_loss_mlp": 1.05430889, + "epoch": 0.9605617545209696, + "flos": 579293420544.0, + "grad_norm": 0.07427096992859433, + "language_loss": 0.82677233, + "learning_rate": 4.073345361845171e-06, + "loss": 0.83745885, + "num_input_tokens_seen": 413765616, + "router_z_loss_mlp": 0.14343262, + "step": 4993, + "time_per_iteration": 2.6981287002563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066158, + "balance_loss_mlp": 1.05191231, + "epoch": 0.9607541362062332, + "flos": 927708857856.0, + "grad_norm": 0.05635650765787246, + "language_loss": 0.86224592, + "learning_rate": 4.033755723872767e-06, + "loss": 0.87290752, + "num_input_tokens_seen": 413850976, + "router_z_loss_mlp": 0.14245605, + "step": 4994, + "time_per_iteration": 3.2580032348632812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069237, + "balance_loss_mlp": 1.0547291, + "epoch": 0.9609465178914968, + "flos": 573121225728.0, + "grad_norm": 0.06976410259133954, + "language_loss": 0.75687838, + "learning_rate": 3.994358637073036e-06, + "loss": 0.76757073, + "num_input_tokens_seen": 413931648, + "router_z_loss_mlp": 0.14501953, + "step": 4995, + "time_per_iteration": 2.8269472122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064304, + "balance_loss_mlp": 1.04968929, + "epoch": 0.9611388995767602, + "flos": 530850475008.0, + "grad_norm": 0.1775846949415705, + "language_loss": 0.85502684, + "learning_rate": 3.955154116741244e-06, + "loss": 
0.86566985, + "num_input_tokens_seen": 414003216, + "router_z_loss_mlp": 0.14599609, + "step": 4996, + "time_per_iteration": 2.6431405544281006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062028, + "balance_loss_mlp": 1.0472939, + "epoch": 0.9613312812620238, + "flos": 646247808000.0, + "grad_norm": 0.07750261021138917, + "language_loss": 0.81917465, + "learning_rate": 3.916142178097881e-06, + "loss": 0.82979488, + "num_input_tokens_seen": 414077072, + "router_z_loss_mlp": 0.1472168, + "step": 4997, + "time_per_iteration": 2.7661595344543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106511, + "balance_loss_mlp": 1.05091262, + "epoch": 0.9615236629472874, + "flos": 496152718848.0, + "grad_norm": 0.0672683260199148, + "language_loss": 0.77680969, + "learning_rate": 3.877322836288888e-06, + "loss": 0.7874608, + "num_input_tokens_seen": 414157600, + "router_z_loss_mlp": 0.14208984, + "step": 4998, + "time_per_iteration": 2.8887362480163574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065659, + "balance_loss_mlp": 1.05123484, + "epoch": 0.961716044632551, + "flos": 512974093824.0, + "grad_norm": 0.0662764042679711, + "language_loss": 0.75444281, + "learning_rate": 3.838696106385153e-06, + "loss": 0.76509941, + "num_input_tokens_seen": 414224880, + "router_z_loss_mlp": 0.14428711, + "step": 4999, + "time_per_iteration": 2.6276803016662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066483, + "balance_loss_mlp": 1.05201101, + "epoch": 0.9619084263178146, + "flos": 501084527616.0, + "grad_norm": 0.07205618121733878, + "language_loss": 0.80739886, + "learning_rate": 3.800262003382904e-06, + "loss": 0.81806368, + "num_input_tokens_seen": 414291728, + "router_z_loss_mlp": 0.14453125, + "step": 5000, + "time_per_iteration": 2.603367567062378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069579, + "balance_loss_mlp": 1.05550003, + "epoch": 0.9621008080030781, + 
"flos": 595635379200.0, + "grad_norm": 0.07888618565398942, + "language_loss": 0.74628067, + "learning_rate": 3.7620205422035923e-06, + "loss": 0.75697649, + "num_input_tokens_seen": 414369568, + "router_z_loss_mlp": 0.14074707, + "step": 5001, + "time_per_iteration": 2.7618119716644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067961, + "balance_loss_mlp": 1.05290532, + "epoch": 0.9622931896883417, + "flos": 502250761728.0, + "grad_norm": 0.10101824369057563, + "language_loss": 0.82141006, + "learning_rate": 3.723971737693899e-06, + "loss": 0.8320896, + "num_input_tokens_seen": 414441424, + "router_z_loss_mlp": 0.15026855, + "step": 5002, + "time_per_iteration": 2.6244583129882812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063578, + "balance_loss_mlp": 1.04942787, + "epoch": 0.9624855713736052, + "flos": 607287808512.0, + "grad_norm": 0.08187350631262881, + "language_loss": 0.80840087, + "learning_rate": 3.6861156046256728e-06, + "loss": 0.81903666, + "num_input_tokens_seen": 414512960, + "router_z_loss_mlp": 0.14160156, + "step": 5003, + "time_per_iteration": 2.8215861320495605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060243, + "balance_loss_mlp": 1.04593801, + "epoch": 0.9626779530588688, + "flos": 510715777536.0, + "grad_norm": 0.10727098163709863, + "language_loss": 0.84822023, + "learning_rate": 3.648452157695936e-06, + "loss": 0.8588227, + "num_input_tokens_seen": 414577392, + "router_z_loss_mlp": 0.14306641, + "step": 5004, + "time_per_iteration": 2.6208014488220215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064426, + "balance_loss_mlp": 1.05009699, + "epoch": 0.9628703347441323, + "flos": 627294025728.0, + "grad_norm": 0.06354974435142602, + "language_loss": 0.82661265, + "learning_rate": 3.610981411526937e-06, + "loss": 0.83725691, + "num_input_tokens_seen": 414655152, + "router_z_loss_mlp": 0.14331055, + "step": 5005, + "time_per_iteration": 
2.8532235622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067404, + "balance_loss_mlp": 1.05277693, + "epoch": 0.9630627164293959, + "flos": 630758223360.0, + "grad_norm": 0.08206220498729579, + "language_loss": 0.77569473, + "learning_rate": 3.573703380666149e-06, + "loss": 0.78636873, + "num_input_tokens_seen": 414730432, + "router_z_loss_mlp": 0.14611816, + "step": 5006, + "time_per_iteration": 2.7677853107452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066717, + "balance_loss_mlp": 1.05243576, + "epoch": 0.9632550981146595, + "flos": 570558961152.0, + "grad_norm": 0.062257883589972376, + "language_loss": 0.78452492, + "learning_rate": 3.5366180795861622e-06, + "loss": 0.79519212, + "num_input_tokens_seen": 414810688, + "router_z_loss_mlp": 0.1427002, + "step": 5007, + "time_per_iteration": 2.869426965713501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065378, + "balance_loss_mlp": 1.0506556, + "epoch": 0.9634474797999231, + "flos": 466117327872.0, + "grad_norm": 0.07554566875409159, + "language_loss": 0.81106812, + "learning_rate": 3.4997255226847937e-06, + "loss": 0.82172191, + "num_input_tokens_seen": 414880544, + "router_z_loss_mlp": 0.14709473, + "step": 5008, + "time_per_iteration": 2.672776460647583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062497, + "balance_loss_mlp": 1.04800081, + "epoch": 0.9636398614851867, + "flos": 526600714752.0, + "grad_norm": 0.08237727119905165, + "language_loss": 0.85430717, + "learning_rate": 3.463025724284974e-06, + "loss": 0.86493218, + "num_input_tokens_seen": 414949920, + "router_z_loss_mlp": 0.14489746, + "step": 5009, + "time_per_iteration": 2.628169536590576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062003, + "balance_loss_mlp": 1.04785311, + "epoch": 0.9638322431704501, + "flos": 564831677952.0, + "grad_norm": 0.07717205590694699, + "language_loss": 0.75397646, + "learning_rate": 
3.4265186986348618e-06, + "loss": 0.76459646, + "num_input_tokens_seen": 415024288, + "router_z_loss_mlp": 0.14160156, + "step": 5010, + "time_per_iteration": 2.793161153793335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064984, + "balance_loss_mlp": 1.05066681, + "epoch": 0.9640246248557137, + "flos": 477772328448.0, + "grad_norm": 0.06549027806244842, + "language_loss": 0.84361243, + "learning_rate": 3.3902044599076754e-06, + "loss": 0.85426223, + "num_input_tokens_seen": 415092032, + "router_z_loss_mlp": 0.14294434, + "step": 5011, + "time_per_iteration": 2.5983877182006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065273, + "balance_loss_mlp": 1.05087256, + "epoch": 0.9642170065409773, + "flos": 539318062080.0, + "grad_norm": 0.0814335313038714, + "language_loss": 0.88396895, + "learning_rate": 3.354083022201859e-06, + "loss": 0.89462173, + "num_input_tokens_seen": 415158544, + "router_z_loss_mlp": 0.14379883, + "step": 5012, + "time_per_iteration": 2.624086618423462 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064149, + "balance_loss_mlp": 1.0500232, + "epoch": 0.9644093882262409, + "flos": 523754325504.0, + "grad_norm": 0.06891136227810866, + "language_loss": 0.83706915, + "learning_rate": 3.3181543995410843e-06, + "loss": 0.84771073, + "num_input_tokens_seen": 415225088, + "router_z_loss_mlp": 0.14123535, + "step": 5013, + "time_per_iteration": 2.6089134216308594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064782, + "balance_loss_mlp": 1.05102515, + "epoch": 0.9646017699115044, + "flos": 574290031104.0, + "grad_norm": 0.08948663754721935, + "language_loss": 0.78484344, + "learning_rate": 3.2824186058740268e-06, + "loss": 0.79549122, + "num_input_tokens_seen": 415300224, + "router_z_loss_mlp": 0.13781738, + "step": 5014, + "time_per_iteration": 2.7396328449249268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068726, + "balance_loss_mlp": 
1.0543493, + "epoch": 0.964794151596768, + "flos": 636799366656.0, + "grad_norm": 0.08470575991353577, + "language_loss": 0.84187967, + "learning_rate": 3.246875655074588e-06, + "loss": 0.85256696, + "num_input_tokens_seen": 415368784, + "router_z_loss_mlp": 0.14355469, + "step": 5015, + "time_per_iteration": 2.7370247840881348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066055, + "balance_loss_mlp": 1.05163109, + "epoch": 0.9649865332820315, + "flos": 617435550720.0, + "grad_norm": 0.07318591973033871, + "language_loss": 0.86297971, + "learning_rate": 3.211525560941675e-06, + "loss": 0.87364024, + "num_input_tokens_seen": 415440752, + "router_z_loss_mlp": 0.14404297, + "step": 5016, + "time_per_iteration": 2.7685937881469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063409, + "balance_loss_mlp": 1.04893649, + "epoch": 0.9651789149672951, + "flos": 516183528960.0, + "grad_norm": 0.05982655632152984, + "language_loss": 0.81268066, + "learning_rate": 3.1763683371994754e-06, + "loss": 0.82331479, + "num_input_tokens_seen": 415516128, + "router_z_loss_mlp": 0.14465332, + "step": 5017, + "time_per_iteration": 2.810807228088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064085, + "balance_loss_mlp": 1.05006623, + "epoch": 0.9653712966525587, + "flos": 492940712448.0, + "grad_norm": 0.12263937824557229, + "language_loss": 0.80021322, + "learning_rate": 3.1414039974972385e-06, + "loss": 0.81085408, + "num_input_tokens_seen": 415583744, + "router_z_loss_mlp": 0.14038086, + "step": 5018, + "time_per_iteration": 2.5756077766418457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063054, + "balance_loss_mlp": 1.04867768, + "epoch": 0.9655636783378222, + "flos": 536560505856.0, + "grad_norm": 0.06991309363729381, + "language_loss": 0.82447302, + "learning_rate": 3.106632555409328e-06, + "loss": 0.83510351, + "num_input_tokens_seen": 415659856, + "router_z_loss_mlp": 0.14355469, + "step": 
5019, + "time_per_iteration": 2.784034013748169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064572, + "balance_loss_mlp": 1.05060053, + "epoch": 0.9657560600230858, + "flos": 459023749632.0, + "grad_norm": 0.06912097229902868, + "language_loss": 0.82277477, + "learning_rate": 3.072054024435167e-06, + "loss": 0.83342046, + "num_input_tokens_seen": 415731792, + "router_z_loss_mlp": 0.13977051, + "step": 5020, + "time_per_iteration": 2.6418702602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065145, + "balance_loss_mlp": 1.05097127, + "epoch": 0.9659484417083494, + "flos": 686178749952.0, + "grad_norm": 0.09913081128691836, + "language_loss": 0.83020145, + "learning_rate": 3.0376684179994064e-06, + "loss": 0.84085286, + "num_input_tokens_seen": 415809536, + "router_z_loss_mlp": 0.14160156, + "step": 5021, + "time_per_iteration": 2.809098482131958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007982, + "balance_loss_mlp": 1.00173593, + "epoch": 0.966140823393613, + "flos": 1502292178944.0, + "grad_norm": 0.003272670931099022, + "language_loss": 0.80694246, + "learning_rate": 3.0034757494516453e-06, + "loss": 0.81702226, + "num_input_tokens_seen": 416027600, + "router_z_loss_mlp": 0.0625, + "step": 5022, + "time_per_iteration": 4.741684198379517 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067318, + "balance_loss_mlp": 1.0534184, + "epoch": 0.9663332050788765, + "flos": 464899336704.0, + "grad_norm": 0.09278307855345254, + "language_loss": 0.81116998, + "learning_rate": 2.9694760320667093e-06, + "loss": 0.82184321, + "num_input_tokens_seen": 416096128, + "router_z_loss_mlp": 0.13916016, + "step": 5023, + "time_per_iteration": 2.5895280838012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064931, + "balance_loss_mlp": 1.05078137, + "epoch": 0.96652558676414, + "flos": 500834907648.0, + "grad_norm": 0.08225049936543592, + "language_loss": 0.85264218, + 
"learning_rate": 2.9356692790444283e-06, + "loss": 0.8632915, + "num_input_tokens_seen": 416164256, + "router_z_loss_mlp": 0.14147949, + "step": 5024, + "time_per_iteration": 2.6449408531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106441, + "balance_loss_mlp": 1.04997373, + "epoch": 0.9667179684494036, + "flos": 424839914496.0, + "grad_norm": 0.098917313378826, + "language_loss": 0.82924014, + "learning_rate": 2.9020555035097484e-06, + "loss": 0.83988422, + "num_input_tokens_seen": 416227296, + "router_z_loss_mlp": 0.14440918, + "step": 5025, + "time_per_iteration": 2.458395481109619 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066655, + "balance_loss_mlp": 1.05213523, + "epoch": 0.9669103501346672, + "flos": 516996628992.0, + "grad_norm": 0.09352035120498775, + "language_loss": 0.85764629, + "learning_rate": 2.8686347185127305e-06, + "loss": 0.86831284, + "num_input_tokens_seen": 416297184, + "router_z_loss_mlp": 0.1451416, + "step": 5026, + "time_per_iteration": 2.673778533935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066519, + "balance_loss_mlp": 1.0520227, + "epoch": 0.9671027318199308, + "flos": 456241600512.0, + "grad_norm": 0.091795487168474, + "language_loss": 0.75505573, + "learning_rate": 2.8354069370284396e-06, + "loss": 0.76572096, + "num_input_tokens_seen": 416363056, + "router_z_loss_mlp": 0.14489746, + "step": 5027, + "time_per_iteration": 2.6124234199523926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065602, + "balance_loss_mlp": 1.05140436, + "epoch": 0.9672951135051943, + "flos": 525058951680.0, + "grad_norm": 0.07445455470407839, + "language_loss": 0.80153406, + "learning_rate": 2.802372171957057e-06, + "loss": 0.81219006, + "num_input_tokens_seen": 416430688, + "router_z_loss_mlp": 0.14196777, + "step": 5028, + "time_per_iteration": 2.6191561222076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066386, + 
"balance_loss_mlp": 1.05184281, + "epoch": 0.9674874951904578, + "flos": 573986082816.0, + "grad_norm": 0.4574262707706258, + "language_loss": 0.79723036, + "learning_rate": 2.7695304361237682e-06, + "loss": 0.80789423, + "num_input_tokens_seen": 416505248, + "router_z_loss_mlp": 0.14526367, + "step": 5029, + "time_per_iteration": 2.830446720123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064711, + "balance_loss_mlp": 1.05021513, + "epoch": 0.9676798768757214, + "flos": 629184153600.0, + "grad_norm": 0.05422559876125122, + "language_loss": 0.79918784, + "learning_rate": 2.7368817422789848e-06, + "loss": 0.80983496, + "num_input_tokens_seen": 416592640, + "router_z_loss_mlp": 0.14489746, + "step": 5030, + "time_per_iteration": 3.0141375064849854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007963, + "balance_loss_mlp": 1.00171638, + "epoch": 0.967872258560985, + "flos": 1463880605184.0, + "grad_norm": 0.0032783952713433553, + "language_loss": 0.75563359, + "learning_rate": 2.7044261030979566e-06, + "loss": 0.76571321, + "num_input_tokens_seen": 416808560, + "router_z_loss_mlp": 0.0625, + "step": 5031, + "time_per_iteration": 4.6728925704956055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067077, + "balance_loss_mlp": 1.05293906, + "epoch": 0.9680646402462486, + "flos": 565503814656.0, + "grad_norm": 0.08898799182976663, + "language_loss": 0.79104608, + "learning_rate": 2.672163531181049e-06, + "loss": 0.80171686, + "num_input_tokens_seen": 416878208, + "router_z_loss_mlp": 0.14160156, + "step": 5032, + "time_per_iteration": 2.70428729057312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007952, + "balance_loss_mlp": 1.00170588, + "epoch": 0.9682570219315121, + "flos": 1434463022592.0, + "grad_norm": 0.003278565206881768, + "language_loss": 0.78074801, + "learning_rate": 2.6400940390537976e-06, + "loss": 0.79082751, + "num_input_tokens_seen": 417105968, + "router_z_loss_mlp": 
0.0625, + "step": 5033, + "time_per_iteration": 4.792613983154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067363, + "balance_loss_mlp": 1.05304611, + "epoch": 0.9684494036167757, + "flos": 584610670080.0, + "grad_norm": 0.07336411283819118, + "language_loss": 0.81745082, + "learning_rate": 2.608217639166688e-06, + "loss": 0.82812446, + "num_input_tokens_seen": 417175168, + "router_z_loss_mlp": 0.14306641, + "step": 5034, + "time_per_iteration": 2.7284703254699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066123, + "balance_loss_mlp": 1.051615, + "epoch": 0.9686417853020393, + "flos": 559064747520.0, + "grad_norm": 0.07168723880196738, + "language_loss": 0.84213465, + "learning_rate": 2.5765343438950982e-06, + "loss": 0.8527959, + "num_input_tokens_seen": 417247760, + "router_z_loss_mlp": 0.1451416, + "step": 5035, + "time_per_iteration": 2.692577838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106653, + "balance_loss_mlp": 1.05180788, + "epoch": 0.9688341669873028, + "flos": 784927604736.0, + "grad_norm": 0.07458595130597709, + "language_loss": 0.83331645, + "learning_rate": 2.545044165539745e-06, + "loss": 0.8439818, + "num_input_tokens_seen": 417324080, + "router_z_loss_mlp": 0.14709473, + "step": 5036, + "time_per_iteration": 2.975346326828003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061475, + "balance_loss_mlp": 1.04697919, + "epoch": 0.9690265486725663, + "flos": 395899176960.0, + "grad_norm": 0.5711945724845235, + "language_loss": 0.79369569, + "learning_rate": 2.513747116326126e-06, + "loss": 0.80431038, + "num_input_tokens_seen": 417386416, + "router_z_loss_mlp": 0.14501953, + "step": 5037, + "time_per_iteration": 2.48323392868042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067034, + "balance_loss_mlp": 1.05312204, + "epoch": 0.9692189303578299, + "flos": 476373726720.0, + "grad_norm": 0.07920913629310455, + "language_loss": 
0.77461714, + "learning_rate": 2.4826432084048002e-06, + "loss": 0.7852875, + "num_input_tokens_seen": 417459648, + "router_z_loss_mlp": 0.13916016, + "step": 5038, + "time_per_iteration": 2.7738237380981445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063867, + "balance_loss_mlp": 1.04974079, + "epoch": 0.9694113120430935, + "flos": 597575066112.0, + "grad_norm": 0.0715485714109308, + "language_loss": 0.78878641, + "learning_rate": 2.451732453851385e-06, + "loss": 0.79942507, + "num_input_tokens_seen": 417530512, + "router_z_loss_mlp": 0.14111328, + "step": 5039, + "time_per_iteration": 2.690324306488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106154, + "balance_loss_mlp": 1.04754531, + "epoch": 0.9696036937283571, + "flos": 500881895424.0, + "grad_norm": 0.07794078914679435, + "language_loss": 0.82386857, + "learning_rate": 2.4210148646665598e-06, + "loss": 0.83448398, + "num_input_tokens_seen": 417597600, + "router_z_loss_mlp": 0.14001465, + "step": 5040, + "time_per_iteration": 2.6172046661376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062988, + "balance_loss_mlp": 1.04806352, + "epoch": 0.9697960754136207, + "flos": 432277088256.0, + "grad_norm": 0.08548810268717333, + "language_loss": 0.87234342, + "learning_rate": 2.3904904527758952e-06, + "loss": 0.88297331, + "num_input_tokens_seen": 417659616, + "router_z_loss_mlp": 0.14904785, + "step": 5041, + "time_per_iteration": 2.5740058422088623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064207, + "balance_loss_mlp": 1.04924631, + "epoch": 0.9699884570988841, + "flos": 568540353024.0, + "grad_norm": 0.06792244357748194, + "language_loss": 0.85212839, + "learning_rate": 2.3601592300300235e-06, + "loss": 0.8627705, + "num_input_tokens_seen": 417730896, + "router_z_loss_mlp": 0.14929199, + "step": 5042, + "time_per_iteration": 2.8025379180908203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068002, + 
"balance_loss_mlp": 1.05411386, + "epoch": 0.9701808387841477, + "flos": 516215835648.0, + "grad_norm": 0.0693889530606864, + "language_loss": 0.81386518, + "learning_rate": 2.33002120820458e-06, + "loss": 0.8245452, + "num_input_tokens_seen": 417803296, + "router_z_loss_mlp": 0.13903809, + "step": 5043, + "time_per_iteration": 2.693671941757202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065828, + "balance_loss_mlp": 1.05174971, + "epoch": 0.9703732204694113, + "flos": 491517517824.0, + "grad_norm": 0.08538153831244098, + "language_loss": 0.76105028, + "learning_rate": 2.300076399000206e-06, + "loss": 0.77170855, + "num_input_tokens_seen": 417870208, + "router_z_loss_mlp": 0.14086914, + "step": 5044, + "time_per_iteration": 2.590811014175415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061301, + "balance_loss_mlp": 1.04690051, + "epoch": 0.9705656021546749, + "flos": 626120451072.0, + "grad_norm": 0.07148835916137017, + "language_loss": 0.80247957, + "learning_rate": 2.2703248140424348e-06, + "loss": 0.81309259, + "num_input_tokens_seen": 417944464, + "router_z_loss_mlp": 0.1439209, + "step": 5045, + "time_per_iteration": 2.781397819519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061296, + "balance_loss_mlp": 1.04675221, + "epoch": 0.9707579838399384, + "flos": 471437148672.0, + "grad_norm": 0.10538235080533889, + "language_loss": 0.83119071, + "learning_rate": 2.2407664648819715e-06, + "loss": 0.84180367, + "num_input_tokens_seen": 418010480, + "router_z_loss_mlp": 0.1451416, + "step": 5046, + "time_per_iteration": 2.5805857181549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067595, + "balance_loss_mlp": 1.05327773, + "epoch": 0.970950365525202, + "flos": 492103019520.0, + "grad_norm": 0.08102393699609502, + "language_loss": 0.80504072, + "learning_rate": 2.2114013629942475e-06, + "loss": 0.81571662, + "num_input_tokens_seen": 418083952, + "router_z_loss_mlp": 
0.14318848, + "step": 5047, + "time_per_iteration": 2.6477913856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106484, + "balance_loss_mlp": 1.05048728, + "epoch": 0.9711427472104656, + "flos": 557322923520.0, + "grad_norm": 0.07039537184358946, + "language_loss": 0.80597341, + "learning_rate": 2.1822295197799213e-06, + "loss": 0.81662178, + "num_input_tokens_seen": 418156672, + "router_z_loss_mlp": 0.14343262, + "step": 5048, + "time_per_iteration": 2.7285449504852295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065725, + "balance_loss_mlp": 1.05186129, + "epoch": 0.9713351288957291, + "flos": 625841095680.0, + "grad_norm": 0.06565780540592017, + "language_loss": 0.83665466, + "learning_rate": 2.153250946564489e-06, + "loss": 0.84731191, + "num_input_tokens_seen": 418242160, + "router_z_loss_mlp": 0.13867188, + "step": 5049, + "time_per_iteration": 2.9449574947357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066983, + "balance_loss_mlp": 1.05283248, + "epoch": 0.9715275105809927, + "flos": 499073260032.0, + "grad_norm": 0.07689693287405414, + "language_loss": 0.81132668, + "learning_rate": 2.1244656545983397e-06, + "loss": 0.82199657, + "num_input_tokens_seen": 418316960, + "router_z_loss_mlp": 0.14147949, + "step": 5050, + "time_per_iteration": 2.722886085510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063615, + "balance_loss_mlp": 1.0492146, + "epoch": 0.9717198922662562, + "flos": 477515367936.0, + "grad_norm": 0.08005861139557306, + "language_loss": 0.77713883, + "learning_rate": 2.0958736550570345e-06, + "loss": 0.78777498, + "num_input_tokens_seen": 418383888, + "router_z_loss_mlp": 0.1439209, + "step": 5051, + "time_per_iteration": 2.56706166267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106432, + "balance_loss_mlp": 1.0501821, + "epoch": 0.9719122739515198, + "flos": 553446120960.0, + "grad_norm": 0.17410068160573605, + 
"language_loss": 0.78690982, + "learning_rate": 2.067474959040916e-06, + "loss": 0.79755294, + "num_input_tokens_seen": 418453776, + "router_z_loss_mlp": 0.14147949, + "step": 5052, + "time_per_iteration": 2.708404302597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066819, + "balance_loss_mlp": 1.05263352, + "epoch": 0.9721046556367834, + "flos": 565852179456.0, + "grad_norm": 0.07134542484886951, + "language_loss": 0.79770613, + "learning_rate": 2.0392695775753312e-06, + "loss": 0.80837435, + "num_input_tokens_seen": 418521984, + "router_z_loss_mlp": 0.1418457, + "step": 5053, + "time_per_iteration": 2.693629264831543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_mlp": 1.05225515, + "epoch": 0.972297037322047, + "flos": 560315045376.0, + "grad_norm": 0.0739721255064351, + "language_loss": 0.78349614, + "learning_rate": 2.0112575216105766e-06, + "loss": 0.79416072, + "num_input_tokens_seen": 418598768, + "router_z_loss_mlp": 0.14196777, + "step": 5054, + "time_per_iteration": 2.7789134979248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106497, + "balance_loss_mlp": 1.05055785, + "epoch": 0.9724894190073105, + "flos": 512440349184.0, + "grad_norm": 0.06850307979501671, + "language_loss": 0.79473531, + "learning_rate": 1.9834388020218974e-06, + "loss": 0.80538499, + "num_input_tokens_seen": 418670064, + "router_z_loss_mlp": 0.14404297, + "step": 5055, + "time_per_iteration": 2.7343311309814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065145, + "balance_loss_mlp": 1.05078018, + "epoch": 0.972681800692574, + "flos": 613832961024.0, + "grad_norm": 0.08293012200245027, + "language_loss": 0.80427051, + "learning_rate": 1.9558134296094875e-06, + "loss": 0.81492198, + "num_input_tokens_seen": 418745216, + "router_z_loss_mlp": 0.14355469, + "step": 5056, + "time_per_iteration": 2.8321659564971924 + }, + { + "auxiliary_loss_clip": 0.0, + 
"auxiliary_loss_mlp": 0.01067271, + "balance_loss_mlp": 1.05302536, + "epoch": 0.9728741823778376, + "flos": 833911635456.0, + "grad_norm": 0.06937694690471158, + "language_loss": 0.84109455, + "learning_rate": 1.92838141509849e-06, + "loss": 0.8517673, + "num_input_tokens_seen": 418824224, + "router_z_loss_mlp": 0.14233398, + "step": 5057, + "time_per_iteration": 3.066319465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064993, + "balance_loss_mlp": 1.05038941, + "epoch": 0.9730665640631012, + "flos": 571450982400.0, + "grad_norm": 0.07422141581965605, + "language_loss": 0.84001803, + "learning_rate": 1.9011427691389415e-06, + "loss": 0.85066795, + "num_input_tokens_seen": 418899712, + "router_z_loss_mlp": 0.14611816, + "step": 5058, + "time_per_iteration": 2.7504796981811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061287, + "balance_loss_mlp": 1.04674363, + "epoch": 0.9732589457483648, + "flos": 506520345600.0, + "grad_norm": 0.07368606718448276, + "language_loss": 0.77334303, + "learning_rate": 1.8740975023057715e-06, + "loss": 0.78395593, + "num_input_tokens_seen": 418964912, + "router_z_loss_mlp": 0.1451416, + "step": 5059, + "time_per_iteration": 2.586639404296875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065659, + "balance_loss_mlp": 1.05141389, + "epoch": 0.9734513274336283, + "flos": 926977623552.0, + "grad_norm": 0.06998637393077584, + "language_loss": 0.80083954, + "learning_rate": 1.84724562509897e-06, + "loss": 0.81149614, + "num_input_tokens_seen": 419040032, + "router_z_loss_mlp": 0.14245605, + "step": 5060, + "time_per_iteration": 3.150885820388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061881, + "balance_loss_mlp": 1.04774249, + "epoch": 0.9736437091188919, + "flos": 491930122752.0, + "grad_norm": 0.09572307555688801, + "language_loss": 0.78052622, + "learning_rate": 1.8205871479433089e-06, + "loss": 0.79114503, + "num_input_tokens_seen": 
419112672, + "router_z_loss_mlp": 0.14147949, + "step": 5061, + "time_per_iteration": 2.7411611080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067565, + "balance_loss_mlp": 1.05327165, + "epoch": 0.9738360908041555, + "flos": 613321611264.0, + "grad_norm": 0.07503058154901762, + "language_loss": 0.83344877, + "learning_rate": 1.7941220811885096e-06, + "loss": 0.84412444, + "num_input_tokens_seen": 419183408, + "router_z_loss_mlp": 0.14282227, + "step": 5062, + "time_per_iteration": 2.7299842834472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007973, + "balance_loss_mlp": 1.00167918, + "epoch": 0.974028472489419, + "flos": 1549561549824.0, + "grad_norm": 0.0032860520737355865, + "language_loss": 0.75992095, + "learning_rate": 1.7678504351092972e-06, + "loss": 0.7700007, + "num_input_tokens_seen": 419415472, + "router_z_loss_mlp": 0.06298828, + "step": 5063, + "time_per_iteration": 4.974630117416382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007973, + "balance_loss_mlp": 1.00167894, + "epoch": 0.9742208541746825, + "flos": 1411155965952.0, + "grad_norm": 0.0032857008369014933, + "language_loss": 0.79677713, + "learning_rate": 1.7417722199051245e-06, + "loss": 0.80685687, + "num_input_tokens_seen": 419651840, + "router_z_loss_mlp": 0.06298828, + "step": 5064, + "time_per_iteration": 4.946727752685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063186, + "balance_loss_mlp": 1.04902434, + "epoch": 0.9744132358599461, + "flos": 674884597248.0, + "grad_norm": 0.06279093313792176, + "language_loss": 0.76888525, + "learning_rate": 1.7158874457005592e-06, + "loss": 0.77951717, + "num_input_tokens_seen": 419729424, + "router_z_loss_mlp": 0.14160156, + "step": 5065, + "time_per_iteration": 2.8534483909606934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066411, + "balance_loss_mlp": 1.05205822, + "epoch": 0.9746056175452097, + "flos": 598407616512.0, + 
"grad_norm": 0.06682786638723953, + "language_loss": 0.77907526, + "learning_rate": 1.690196122544896e-06, + "loss": 0.78973937, + "num_input_tokens_seen": 419803616, + "router_z_loss_mlp": 0.14367676, + "step": 5066, + "time_per_iteration": 2.826382637023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066072, + "balance_loss_mlp": 1.05201697, + "epoch": 0.9747979992304733, + "flos": 732175428096.0, + "grad_norm": 0.061050992503925997, + "language_loss": 0.82334244, + "learning_rate": 1.6646982604123784e-06, + "loss": 0.83400315, + "num_input_tokens_seen": 419883536, + "router_z_loss_mlp": 0.140625, + "step": 5067, + "time_per_iteration": 3.0268359184265137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064955, + "balance_loss_mlp": 1.05038726, + "epoch": 0.9749903809157369, + "flos": 616499112960.0, + "grad_norm": 0.08601228130701646, + "language_loss": 0.76389635, + "learning_rate": 1.6393938692022548e-06, + "loss": 0.77454591, + "num_input_tokens_seen": 419956816, + "router_z_loss_mlp": 0.14550781, + "step": 5068, + "time_per_iteration": 2.7173147201538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063536, + "balance_loss_mlp": 1.04950488, + "epoch": 0.9751827626010003, + "flos": 468398039040.0, + "grad_norm": 0.6661715569516079, + "language_loss": 0.83873451, + "learning_rate": 1.6142829587384443e-06, + "loss": 0.84936988, + "num_input_tokens_seen": 420022096, + "router_z_loss_mlp": 0.14038086, + "step": 5069, + "time_per_iteration": 2.6044535636901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106529, + "balance_loss_mlp": 1.05052018, + "epoch": 0.9753751442862639, + "flos": 599215574016.0, + "grad_norm": 0.21413016416202899, + "language_loss": 0.8517248, + "learning_rate": 1.5893655387698713e-06, + "loss": 0.86237764, + "num_input_tokens_seen": 420097008, + "router_z_loss_mlp": 0.14746094, + "step": 5070, + "time_per_iteration": 2.7918972969055176 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067126, + "balance_loss_mlp": 1.05289221, + "epoch": 0.9755675259715275, + "flos": 650806285824.0, + "grad_norm": 0.07997280605305106, + "language_loss": 0.81889033, + "learning_rate": 1.5646416189704637e-06, + "loss": 0.82956159, + "num_input_tokens_seen": 420174960, + "router_z_loss_mlp": 0.14257812, + "step": 5071, + "time_per_iteration": 2.8993237018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064614, + "balance_loss_mlp": 1.05048764, + "epoch": 0.9757599076567911, + "flos": 563658103296.0, + "grad_norm": 0.10641549726057599, + "language_loss": 0.78939104, + "learning_rate": 1.5401112089387659e-06, + "loss": 0.80003721, + "num_input_tokens_seen": 420245248, + "router_z_loss_mlp": 0.14135742, + "step": 5072, + "time_per_iteration": 2.6914937496185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065295, + "balance_loss_mlp": 1.05088246, + "epoch": 0.9759522893420547, + "flos": 504637558272.0, + "grad_norm": 0.07895573194632217, + "language_loss": 0.80287015, + "learning_rate": 1.5157743181983819e-06, + "loss": 0.81352311, + "num_input_tokens_seen": 420310688, + "router_z_loss_mlp": 0.14404297, + "step": 5073, + "time_per_iteration": 2.655817747116089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064662, + "balance_loss_mlp": 1.05040479, + "epoch": 0.9761446710273182, + "flos": 583728560640.0, + "grad_norm": 0.07101453734648233, + "language_loss": 0.82085502, + "learning_rate": 1.4916309561976982e-06, + "loss": 0.83150166, + "num_input_tokens_seen": 420379008, + "router_z_loss_mlp": 0.14245605, + "step": 5074, + "time_per_iteration": 2.691014289855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064848, + "balance_loss_mlp": 1.05055475, + "epoch": 0.9763370527125818, + "flos": 482207468544.0, + "grad_norm": 0.09505320344300444, + "language_loss": 0.81791657, + "learning_rate": 1.4676811323099947e-06, + "loss": 
0.828565, + "num_input_tokens_seen": 420445504, + "router_z_loss_mlp": 0.14282227, + "step": 5075, + "time_per_iteration": 2.6115615367889404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067149, + "balance_loss_mlp": 1.05314183, + "epoch": 0.9765294343978453, + "flos": 618987225600.0, + "grad_norm": 0.07513940722020379, + "language_loss": 0.78498113, + "learning_rate": 1.4439248558335561e-06, + "loss": 0.79565263, + "num_input_tokens_seen": 420520528, + "router_z_loss_mlp": 0.14025879, + "step": 5076, + "time_per_iteration": 2.7157347202301025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064035, + "balance_loss_mlp": 1.04988503, + "epoch": 0.9767218160831089, + "flos": 526573550592.0, + "grad_norm": 0.08975883867321018, + "language_loss": 0.85001129, + "learning_rate": 1.4203621359911712e-06, + "loss": 0.86065167, + "num_input_tokens_seen": 420586224, + "router_z_loss_mlp": 0.14147949, + "step": 5077, + "time_per_iteration": 2.6061902046203613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064199, + "balance_loss_mlp": 1.04989409, + "epoch": 0.9769141977683724, + "flos": 525194772480.0, + "grad_norm": 0.10763924777787955, + "language_loss": 0.8412196, + "learning_rate": 1.3969929819308557e-06, + "loss": 0.8518616, + "num_input_tokens_seen": 420655456, + "router_z_loss_mlp": 0.14294434, + "step": 5078, + "time_per_iteration": 2.6408932209014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069482, + "balance_loss_mlp": 1.05521297, + "epoch": 0.977106579453636, + "flos": 457615236096.0, + "grad_norm": 0.08732182251457153, + "language_loss": 0.80499446, + "learning_rate": 1.3738174027252416e-06, + "loss": 0.81568927, + "num_input_tokens_seen": 420733216, + "router_z_loss_mlp": 0.14257812, + "step": 5079, + "time_per_iteration": 2.821551561355591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066592, + "balance_loss_mlp": 1.05263269, + "epoch": 0.9772989611388996, + 
"flos": 532090861056.0, + "grad_norm": 0.06587331021593097, + "language_loss": 0.81444585, + "learning_rate": 1.3508354073719642e-06, + "loss": 0.82511181, + "num_input_tokens_seen": 420803376, + "router_z_loss_mlp": 0.13964844, + "step": 5080, + "time_per_iteration": 2.6979024410247803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106446, + "balance_loss_mlp": 1.05042887, + "epoch": 0.9774913428241632, + "flos": 755349235200.0, + "grad_norm": 0.06897133924959288, + "language_loss": 0.86031032, + "learning_rate": 1.3280470047933313e-06, + "loss": 0.87095487, + "num_input_tokens_seen": 420886256, + "router_z_loss_mlp": 0.14025879, + "step": 5081, + "time_per_iteration": 3.041377544403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007931, + "balance_loss_mlp": 1.0016849, + "epoch": 0.9776837245094268, + "flos": 1554320088576.0, + "grad_norm": 0.0032836554986033295, + "language_loss": 0.78895497, + "learning_rate": 1.3054522038366544e-06, + "loss": 0.79903424, + "num_input_tokens_seen": 421123728, + "router_z_loss_mlp": 0.0625, + "step": 5082, + "time_per_iteration": 4.9710633754730225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065578, + "balance_loss_mlp": 1.05097508, + "epoch": 0.9778761061946902, + "flos": 592534600704.0, + "grad_norm": 0.09877272311623977, + "language_loss": 0.83793986, + "learning_rate": 1.2830510132739725e-06, + "loss": 0.84859562, + "num_input_tokens_seen": 421192576, + "router_z_loss_mlp": 0.14599609, + "step": 5083, + "time_per_iteration": 2.6839962005615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066407, + "balance_loss_mlp": 1.05211425, + "epoch": 0.9780684878799538, + "flos": 414951704064.0, + "grad_norm": 0.08910611593558808, + "language_loss": 0.82052761, + "learning_rate": 1.2608434418022175e-06, + "loss": 0.83119166, + "num_input_tokens_seen": 421256272, + "router_z_loss_mlp": 0.14282227, + "step": 5084, + "time_per_iteration": 
2.4916131496429443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069144, + "balance_loss_mlp": 1.05482697, + "epoch": 0.9782608695652174, + "flos": 568411872768.0, + "grad_norm": 0.0683698369039321, + "language_loss": 0.84943771, + "learning_rate": 1.2388294980431036e-06, + "loss": 0.86012912, + "num_input_tokens_seen": 421332880, + "router_z_loss_mlp": 0.14331055, + "step": 5085, + "time_per_iteration": 2.727896213531494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064723, + "balance_loss_mlp": 1.05031037, + "epoch": 0.978453251250481, + "flos": 690472926720.0, + "grad_norm": 0.08017894045097873, + "language_loss": 0.82961535, + "learning_rate": 1.217009190543239e-06, + "loss": 0.84026265, + "num_input_tokens_seen": 421406160, + "router_z_loss_mlp": 0.14416504, + "step": 5086, + "time_per_iteration": 2.8707313537597656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069677, + "balance_loss_mlp": 1.05558658, + "epoch": 0.9786456329357445, + "flos": 502505150976.0, + "grad_norm": 0.0603639816123071, + "language_loss": 0.77216703, + "learning_rate": 1.1953825277740694e-06, + "loss": 0.7828638, + "num_input_tokens_seen": 421476208, + "router_z_loss_mlp": 0.14086914, + "step": 5087, + "time_per_iteration": 2.6551520824432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062505, + "balance_loss_mlp": 1.04825974, + "epoch": 0.9788380146210081, + "flos": 863183485440.0, + "grad_norm": 0.08415951244120122, + "language_loss": 0.80294234, + "learning_rate": 1.1739495181317117e-06, + "loss": 0.8135674, + "num_input_tokens_seen": 421549232, + "router_z_loss_mlp": 0.14257812, + "step": 5088, + "time_per_iteration": 3.0204029083251953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068699, + "balance_loss_mlp": 1.05485845, + "epoch": 0.9790303963062716, + "flos": 512717133312.0, + "grad_norm": 0.07232346891322454, + "language_loss": 0.84229541, + "learning_rate": 
1.1527101699371767e-06, + "loss": 0.85298246, + "num_input_tokens_seen": 421617056, + "router_z_loss_mlp": 0.1385498, + "step": 5089, + "time_per_iteration": 2.6450371742248535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068262, + "balance_loss_mlp": 1.05395687, + "epoch": 0.9792227779915352, + "flos": 494428147200.0, + "grad_norm": 0.08922991486466687, + "language_loss": 0.86236167, + "learning_rate": 1.1316644914364237e-06, + "loss": 0.87304425, + "num_input_tokens_seen": 421683424, + "router_z_loss_mlp": 0.14318848, + "step": 5090, + "time_per_iteration": 2.6085898876190186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062554, + "balance_loss_mlp": 1.04827309, + "epoch": 0.9794151596767988, + "flos": 608325562368.0, + "grad_norm": 0.06779358783176108, + "language_loss": 0.81499738, + "learning_rate": 1.1108124908000838e-06, + "loss": 0.82562292, + "num_input_tokens_seen": 421761200, + "router_z_loss_mlp": 0.14282227, + "step": 5091, + "time_per_iteration": 2.785670280456543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106137, + "balance_loss_mlp": 1.04681492, + "epoch": 0.9796075413620623, + "flos": 478222009344.0, + "grad_norm": 0.07345107204031283, + "language_loss": 0.86748743, + "learning_rate": 1.09015417612357e-06, + "loss": 0.87810111, + "num_input_tokens_seen": 421829600, + "router_z_loss_mlp": 0.14550781, + "step": 5092, + "time_per_iteration": 2.6004750728607178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063152, + "balance_loss_mlp": 1.04863238, + "epoch": 0.9797999230473259, + "flos": 592220740608.0, + "grad_norm": 0.06917655152428695, + "language_loss": 0.84302372, + "learning_rate": 1.0696895554271335e-06, + "loss": 0.85365528, + "num_input_tokens_seen": 421904928, + "router_z_loss_mlp": 0.1451416, + "step": 5093, + "time_per_iteration": 2.8177921772003174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106471, + "balance_loss_mlp": 1.0502739, 
+ "epoch": 0.9799923047325895, + "flos": 556381343232.0, + "grad_norm": 0.06567011725457712, + "language_loss": 0.81470811, + "learning_rate": 1.049418636655919e-06, + "loss": 0.82535523, + "num_input_tokens_seen": 421989616, + "router_z_loss_mlp": 0.14440918, + "step": 5094, + "time_per_iteration": 2.9339916706085205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106685, + "balance_loss_mlp": 1.05187774, + "epoch": 0.9801846864178531, + "flos": 579456405504.0, + "grad_norm": 0.06290617495245203, + "language_loss": 0.84237778, + "learning_rate": 1.0293414276797974e-06, + "loss": 0.85304636, + "num_input_tokens_seen": 422067088, + "router_z_loss_mlp": 0.14953613, + "step": 5095, + "time_per_iteration": 2.7596583366394043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063683, + "balance_loss_mlp": 1.04974759, + "epoch": 0.9803770681031165, + "flos": 515101358592.0, + "grad_norm": 0.05884649286884671, + "language_loss": 0.79774284, + "learning_rate": 1.0094579362933677e-06, + "loss": 0.80837965, + "num_input_tokens_seen": 422141136, + "router_z_loss_mlp": 0.1394043, + "step": 5096, + "time_per_iteration": 2.717756509780884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066078, + "balance_loss_mlp": 1.05201113, + "epoch": 0.9805694497883801, + "flos": 566988678144.0, + "grad_norm": 0.06510968800704982, + "language_loss": 0.78243887, + "learning_rate": 9.897681702160654e-07, + "loss": 0.79309964, + "num_input_tokens_seen": 422216400, + "router_z_loss_mlp": 0.14074707, + "step": 5097, + "time_per_iteration": 2.7859761714935303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062526, + "balance_loss_mlp": 1.04820871, + "epoch": 0.9807618314736437, + "flos": 479351167488.0, + "grad_norm": 0.06784455696398038, + "language_loss": 0.7347126, + "learning_rate": 9.702721370922208e-07, + "loss": 0.74533784, + "num_input_tokens_seen": 422287664, + "router_z_loss_mlp": 0.14318848, + "step": 5098, + 
"time_per_iteration": 2.704630136489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061768, + "balance_loss_mlp": 1.0477488, + "epoch": 0.9809542131589073, + "flos": 545285053440.0, + "grad_norm": 0.11478990612033974, + "language_loss": 0.79899949, + "learning_rate": 9.509698444908344e-07, + "loss": 0.80961716, + "num_input_tokens_seen": 422357552, + "router_z_loss_mlp": 0.14038086, + "step": 5099, + "time_per_iteration": 2.6292612552642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065944, + "balance_loss_mlp": 1.05166292, + "epoch": 0.9811465948441709, + "flos": 520843696128.0, + "grad_norm": 0.07093256961934312, + "language_loss": 0.79454851, + "learning_rate": 9.318612999057452e-07, + "loss": 0.80520797, + "num_input_tokens_seen": 422425872, + "router_z_loss_mlp": 0.14282227, + "step": 5100, + "time_per_iteration": 2.605419874191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067535, + "balance_loss_mlp": 1.05283618, + "epoch": 0.9813389765294344, + "flos": 541282341888.0, + "grad_norm": 0.07637881185525433, + "language_loss": 0.80382729, + "learning_rate": 9.129465107554635e-07, + "loss": 0.81450266, + "num_input_tokens_seen": 422495760, + "router_z_loss_mlp": 0.14672852, + "step": 5101, + "time_per_iteration": 2.6618900299072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062636, + "balance_loss_mlp": 1.04866457, + "epoch": 0.981531358214698, + "flos": 567356866560.0, + "grad_norm": 0.07326205712119045, + "language_loss": 0.84316814, + "learning_rate": 8.942254843834485e-07, + "loss": 0.85379446, + "num_input_tokens_seen": 422568112, + "router_z_loss_mlp": 0.13989258, + "step": 5102, + "time_per_iteration": 2.7000365257263184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065302, + "balance_loss_mlp": 1.05118787, + "epoch": 0.9817237398999615, + "flos": 577272241152.0, + "grad_norm": 0.07352728739479987, + "language_loss": 0.80912358, + 
"learning_rate": 8.756982280578307e-07, + "loss": 0.81977654, + "num_input_tokens_seen": 422641280, + "router_z_loss_mlp": 0.14123535, + "step": 5103, + "time_per_iteration": 2.716947555541992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063153, + "balance_loss_mlp": 1.04894328, + "epoch": 0.9819161215852251, + "flos": 701507547648.0, + "grad_norm": 0.0812537946664224, + "language_loss": 0.8192457, + "learning_rate": 8.573647489714676e-07, + "loss": 0.82987726, + "num_input_tokens_seen": 422720416, + "router_z_loss_mlp": 0.14208984, + "step": 5104, + "time_per_iteration": 2.9482638835906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067939, + "balance_loss_mlp": 1.05362189, + "epoch": 0.9821085032704886, + "flos": 624188104704.0, + "grad_norm": 0.0735501937900119, + "language_loss": 0.84292555, + "learning_rate": 8.392250542421653e-07, + "loss": 0.85360503, + "num_input_tokens_seen": 422800384, + "router_z_loss_mlp": 0.14306641, + "step": 5105, + "time_per_iteration": 2.8968729972839355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062589, + "balance_loss_mlp": 1.04799807, + "epoch": 0.9823008849557522, + "flos": 499505688576.0, + "grad_norm": 0.07164543786345488, + "language_loss": 0.8119458, + "learning_rate": 8.212791509122353e-07, + "loss": 0.82257169, + "num_input_tokens_seen": 422870768, + "router_z_loss_mlp": 0.14587402, + "step": 5106, + "time_per_iteration": 2.7407009601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064272, + "balance_loss_mlp": 1.04980028, + "epoch": 0.9824932666410158, + "flos": 523815994368.0, + "grad_norm": 0.08625390255537382, + "language_loss": 0.72545767, + "learning_rate": 8.035270459489929e-07, + "loss": 0.73610038, + "num_input_tokens_seen": 422942864, + "router_z_loss_mlp": 0.14453125, + "step": 5107, + "time_per_iteration": 2.7165608406066895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067535, + 
"balance_loss_mlp": 1.05325365, + "epoch": 0.9826856483262794, + "flos": 502663366656.0, + "grad_norm": 0.0719645103131503, + "language_loss": 0.8213681, + "learning_rate": 7.859687462443698e-07, + "loss": 0.83204341, + "num_input_tokens_seen": 423013600, + "router_z_loss_mlp": 0.14282227, + "step": 5108, + "time_per_iteration": 2.6449546813964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065727, + "balance_loss_mlp": 1.05152941, + "epoch": 0.982878030011543, + "flos": 562056869376.0, + "grad_norm": 0.06031610149525448, + "language_loss": 0.84049594, + "learning_rate": 7.686042586151354e-07, + "loss": 0.85115325, + "num_input_tokens_seen": 423093680, + "router_z_loss_mlp": 0.14196777, + "step": 5109, + "time_per_iteration": 2.8201980590820312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106353, + "balance_loss_mlp": 1.04928493, + "epoch": 0.9830704116968064, + "flos": 537101591040.0, + "grad_norm": 0.06932231070065256, + "language_loss": 0.82637227, + "learning_rate": 7.514335898027857e-07, + "loss": 0.83700758, + "num_input_tokens_seen": 423168608, + "router_z_loss_mlp": 0.14245605, + "step": 5110, + "time_per_iteration": 2.7956700325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063874, + "balance_loss_mlp": 1.04966426, + "epoch": 0.98326279338207, + "flos": 458949597696.0, + "grad_norm": 0.06270744852863061, + "language_loss": 0.84185314, + "learning_rate": 7.344567464735441e-07, + "loss": 0.85249186, + "num_input_tokens_seen": 423233552, + "router_z_loss_mlp": 0.14221191, + "step": 5111, + "time_per_iteration": 2.526143789291382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063111, + "balance_loss_mlp": 1.04903221, + "epoch": 0.9834551750673336, + "flos": 640974974976.0, + "grad_norm": 0.06650494434915036, + "language_loss": 0.79163671, + "learning_rate": 7.17673735218416e-07, + "loss": 0.80226785, + "num_input_tokens_seen": 423307440, + "router_z_loss_mlp": 
0.14074707, + "step": 5112, + "time_per_iteration": 2.8292341232299805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061423, + "balance_loss_mlp": 1.04775, + "epoch": 0.9836475567525972, + "flos": 1071807220224.0, + "grad_norm": 0.07946110892144641, + "language_loss": 0.79060733, + "learning_rate": 7.010845625530782e-07, + "loss": 0.80122155, + "num_input_tokens_seen": 423394880, + "router_z_loss_mlp": 0.13696289, + "step": 5113, + "time_per_iteration": 3.4044573307037354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066495, + "balance_loss_mlp": 1.05227315, + "epoch": 0.9838399384378607, + "flos": 565209778176.0, + "grad_norm": 0.09866362357616712, + "language_loss": 0.75764799, + "learning_rate": 6.846892349181566e-07, + "loss": 0.76831293, + "num_input_tokens_seen": 423461792, + "router_z_loss_mlp": 0.14221191, + "step": 5114, + "time_per_iteration": 2.724730968475342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063835, + "balance_loss_mlp": 1.04963779, + "epoch": 0.9840323201231242, + "flos": 772805670912.0, + "grad_norm": 0.0911355229399916, + "language_loss": 0.79936361, + "learning_rate": 6.684877586787819e-07, + "loss": 0.81000197, + "num_input_tokens_seen": 423539952, + "router_z_loss_mlp": 0.14208984, + "step": 5115, + "time_per_iteration": 3.0147950649261475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064109, + "balance_loss_mlp": 1.04974413, + "epoch": 0.9842247018083878, + "flos": 472262358528.0, + "grad_norm": 0.10523121781623718, + "language_loss": 0.85520661, + "learning_rate": 6.524801401249225e-07, + "loss": 0.86584771, + "num_input_tokens_seen": 423607184, + "router_z_loss_mlp": 0.14367676, + "step": 5116, + "time_per_iteration": 2.5995094776153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064507, + "balance_loss_mlp": 1.05032063, + "epoch": 0.9844170834936514, + "flos": 525259012608.0, + "grad_norm": 0.07203158366187926, + 
"language_loss": 0.84932005, + "learning_rate": 6.366663854713295e-07, + "loss": 0.85996509, + "num_input_tokens_seen": 423676528, + "router_z_loss_mlp": 0.14196777, + "step": 5117, + "time_per_iteration": 2.637052297592163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007922, + "balance_loss_mlp": 1.00167584, + "epoch": 0.984609465178915, + "flos": 1567247408640.0, + "grad_norm": 0.0032849089870143, + "language_loss": 0.77162516, + "learning_rate": 6.210465008574251e-07, + "loss": 0.78170443, + "num_input_tokens_seen": 423905856, + "router_z_loss_mlp": 0.0625, + "step": 5118, + "time_per_iteration": 4.90720272064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068707, + "balance_loss_mlp": 1.0540328, + "epoch": 0.9848018468641785, + "flos": 519548981760.0, + "grad_norm": 0.09557736917405237, + "language_loss": 0.82077289, + "learning_rate": 6.056204923473584e-07, + "loss": 0.83145994, + "num_input_tokens_seen": 423972496, + "router_z_loss_mlp": 0.14672852, + "step": 5119, + "time_per_iteration": 2.6469926834106445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065256, + "balance_loss_mlp": 1.05113006, + "epoch": 0.9849942285494421, + "flos": 493004952576.0, + "grad_norm": 0.07479661629278153, + "language_loss": 0.82782626, + "learning_rate": 5.903883659301167e-07, + "loss": 0.83847886, + "num_input_tokens_seen": 424039968, + "router_z_loss_mlp": 0.14147949, + "step": 5120, + "time_per_iteration": 2.576946973800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066832, + "balance_loss_mlp": 1.05213332, + "epoch": 0.9851866102347057, + "flos": 546001606656.0, + "grad_norm": 0.08446497011390579, + "language_loss": 0.80810666, + "learning_rate": 5.753501275193029e-07, + "loss": 0.81877494, + "num_input_tokens_seen": 424108096, + "router_z_loss_mlp": 0.14685059, + "step": 5121, + "time_per_iteration": 2.6319987773895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 
0.01063985, + "balance_loss_mlp": 1.04960883, + "epoch": 0.9853789919199692, + "flos": 476257729536.0, + "grad_norm": 0.07681446659102178, + "language_loss": 0.80095053, + "learning_rate": 5.605057829531912e-07, + "loss": 0.81159031, + "num_input_tokens_seen": 424172256, + "router_z_loss_mlp": 0.14355469, + "step": 5122, + "time_per_iteration": 2.5240464210510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061639, + "balance_loss_mlp": 1.04738188, + "epoch": 0.9855713736052328, + "flos": 1032619995648.0, + "grad_norm": 0.08827178594358556, + "language_loss": 0.76197588, + "learning_rate": 5.458553379950049e-07, + "loss": 0.77259231, + "num_input_tokens_seen": 424261088, + "router_z_loss_mlp": 0.14245605, + "step": 5123, + "time_per_iteration": 3.3713111877441406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063782, + "balance_loss_mlp": 1.04914308, + "epoch": 0.9857637552904963, + "flos": 495050724864.0, + "grad_norm": 0.06078629887219036, + "language_loss": 0.82555091, + "learning_rate": 5.31398798332472e-07, + "loss": 0.83618873, + "num_input_tokens_seen": 424329168, + "router_z_loss_mlp": 0.14611816, + "step": 5124, + "time_per_iteration": 2.6111574172973633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066007, + "balance_loss_mlp": 1.05182135, + "epoch": 0.9859561369757599, + "flos": 592267728384.0, + "grad_norm": 0.083267958532, + "language_loss": 0.83494437, + "learning_rate": 5.17136169578103e-07, + "loss": 0.84560442, + "num_input_tokens_seen": 424399392, + "router_z_loss_mlp": 0.1418457, + "step": 5125, + "time_per_iteration": 2.6993632316589355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065376, + "balance_loss_mlp": 1.05084407, + "epoch": 0.9861485186610235, + "flos": 486971149824.0, + "grad_norm": 0.07420331349038331, + "language_loss": 0.78526759, + "learning_rate": 5.030674572691907e-07, + "loss": 0.79592133, + "num_input_tokens_seen": 424470080, + 
"router_z_loss_mlp": 0.1451416, + "step": 5126, + "time_per_iteration": 2.6422102451324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063683, + "balance_loss_mlp": 1.04927087, + "epoch": 0.9863409003462871, + "flos": 518795352576.0, + "grad_norm": 0.058719013757474826, + "language_loss": 0.82536149, + "learning_rate": 4.891926668676994e-07, + "loss": 0.83599836, + "num_input_tokens_seen": 424541824, + "router_z_loss_mlp": 0.14404297, + "step": 5127, + "time_per_iteration": 2.7151970863342285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007972, + "balance_loss_mlp": 1.00167775, + "epoch": 0.9865332820315506, + "flos": 1486026570240.0, + "grad_norm": 0.003284256404778656, + "language_loss": 0.79182732, + "learning_rate": 4.755118037602646e-07, + "loss": 0.80190706, + "num_input_tokens_seen": 424773408, + "router_z_loss_mlp": 0.06298828, + "step": 5128, + "time_per_iteration": 4.89760160446167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067477, + "balance_loss_mlp": 1.05338657, + "epoch": 0.9867256637168141, + "flos": 582112645632.0, + "grad_norm": 0.06977988742925464, + "language_loss": 0.78998387, + "learning_rate": 4.620248732582488e-07, + "loss": 0.80065858, + "num_input_tokens_seen": 424840608, + "router_z_loss_mlp": 0.14111328, + "step": 5129, + "time_per_iteration": 2.7023425102233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063752, + "balance_loss_mlp": 1.05001903, + "epoch": 0.9869180454020777, + "flos": 959303264256.0, + "grad_norm": 0.1397619668456288, + "language_loss": 0.86259735, + "learning_rate": 4.487318805977969e-07, + "loss": 0.87323487, + "num_input_tokens_seen": 424926128, + "router_z_loss_mlp": 0.13757324, + "step": 5130, + "time_per_iteration": 3.240145444869995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065812, + "balance_loss_mlp": 1.05161428, + "epoch": 0.9871104270873413, + "flos": 770730163200.0, + "grad_norm": 
0.07001869751455264, + "language_loss": 0.82417846, + "learning_rate": 4.3563283093966954e-07, + "loss": 0.8348366, + "num_input_tokens_seen": 425005744, + "router_z_loss_mlp": 0.14196777, + "step": 5131, + "time_per_iteration": 3.044957399368286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063083, + "balance_loss_mlp": 1.04839683, + "epoch": 0.9873028087726049, + "flos": 446444794368.0, + "grad_norm": 0.08295760254842617, + "language_loss": 0.77687156, + "learning_rate": 4.2272772936940986e-07, + "loss": 0.78750235, + "num_input_tokens_seen": 425068112, + "router_z_loss_mlp": 0.14672852, + "step": 5132, + "time_per_iteration": 2.5449488162994385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063168, + "balance_loss_mlp": 1.04895806, + "epoch": 0.9874951904578684, + "flos": 507612427776.0, + "grad_norm": 0.06787160975467058, + "language_loss": 0.86360145, + "learning_rate": 4.1001658089717676e-07, + "loss": 0.87423307, + "num_input_tokens_seen": 425137408, + "router_z_loss_mlp": 0.14221191, + "step": 5133, + "time_per_iteration": 2.5786757469177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070832, + "balance_loss_mlp": 1.05663383, + "epoch": 0.987687572143132, + "flos": 716742743040.0, + "grad_norm": 0.07149794752795115, + "language_loss": 0.82624304, + "learning_rate": 3.9749939045791164e-07, + "loss": 0.83695138, + "num_input_tokens_seen": 425213504, + "router_z_loss_mlp": 0.14196777, + "step": 5134, + "time_per_iteration": 2.9011013507843018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007975, + "balance_loss_mlp": 1.00168061, + "epoch": 0.9878799538283956, + "flos": 1538647695360.0, + "grad_norm": 0.0032849775675939607, + "language_loss": 0.79817951, + "learning_rate": 3.851761629111716e-07, + "loss": 0.80825925, + "num_input_tokens_seen": 425451296, + "router_z_loss_mlp": 0.06298828, + "step": 5135, + "time_per_iteration": 4.909507989883423 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068183, + "balance_loss_mlp": 1.0540688, + "epoch": 0.9880723355136591, + "flos": 721424931840.0, + "grad_norm": 0.06089333863399881, + "language_loss": 0.81941283, + "learning_rate": 3.730469030412964e-07, + "loss": 0.83009458, + "num_input_tokens_seen": 425527536, + "router_z_loss_mlp": 0.14111328, + "step": 5136, + "time_per_iteration": 2.9317398071289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070456, + "balance_loss_mlp": 1.05611491, + "epoch": 0.9882647171989226, + "flos": 557350087680.0, + "grad_norm": 0.06444358386944021, + "language_loss": 0.84564662, + "learning_rate": 3.611116155572969e-07, + "loss": 0.85635114, + "num_input_tokens_seen": 425596608, + "router_z_loss_mlp": 0.14318848, + "step": 5137, + "time_per_iteration": 2.7681379318237305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068262, + "balance_loss_mlp": 1.05404043, + "epoch": 0.9884570988841862, + "flos": 562820410368.0, + "grad_norm": 0.08053683664726487, + "language_loss": 0.80556041, + "learning_rate": 3.493703050927999e-07, + "loss": 0.81624299, + "num_input_tokens_seen": 425667280, + "router_z_loss_mlp": 0.14221191, + "step": 5138, + "time_per_iteration": 2.717684745788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063653, + "balance_loss_mlp": 1.04932356, + "epoch": 0.9886494805694498, + "flos": 431763167232.0, + "grad_norm": 0.08261079522387915, + "language_loss": 0.86220396, + "learning_rate": 3.378229762062146e-07, + "loss": 0.87284046, + "num_input_tokens_seen": 425730736, + "router_z_loss_mlp": 0.14331055, + "step": 5139, + "time_per_iteration": 2.5395169258117676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062583, + "balance_loss_mlp": 1.0483849, + "epoch": 0.9888418622547134, + "flos": 592082348544.0, + "grad_norm": 0.06860715832060843, + "language_loss": 0.9065218, + "learning_rate": 3.264696333806771e-07, + "loss": 0.91714764, + 
"num_input_tokens_seen": 425807616, + "router_z_loss_mlp": 0.14196777, + "step": 5140, + "time_per_iteration": 2.7795023918151855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064018, + "balance_loss_mlp": 1.04983258, + "epoch": 0.989034243939977, + "flos": 1134993461760.0, + "grad_norm": 0.06501878565104381, + "language_loss": 0.80251312, + "learning_rate": 3.1531028102388394e-07, + "loss": 0.81315333, + "num_input_tokens_seen": 425900880, + "router_z_loss_mlp": 0.1418457, + "step": 5141, + "time_per_iteration": 3.5281801223754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067707, + "balance_loss_mlp": 1.05318689, + "epoch": 0.9892266256252404, + "flos": 566670048768.0, + "grad_norm": 0.0888536898085742, + "language_loss": 0.82055813, + "learning_rate": 3.0434492346825824e-07, + "loss": 0.83123523, + "num_input_tokens_seen": 425973632, + "router_z_loss_mlp": 0.14501953, + "step": 5142, + "time_per_iteration": 2.697338104248047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066844, + "balance_loss_mlp": 1.05258632, + "epoch": 0.989419007310504, + "flos": 640577051136.0, + "grad_norm": 0.07447901058049321, + "language_loss": 0.84180474, + "learning_rate": 2.9357356497095033e-07, + "loss": 0.85247314, + "num_input_tokens_seen": 426057088, + "router_z_loss_mlp": 0.14257812, + "step": 5143, + "time_per_iteration": 2.9698703289031982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065942, + "balance_loss_mlp": 1.05180383, + "epoch": 0.9896113889957676, + "flos": 455478059520.0, + "grad_norm": 0.07747172431419576, + "language_loss": 0.81499732, + "learning_rate": 2.829962097138372e-07, + "loss": 0.82565677, + "num_input_tokens_seen": 426124336, + "router_z_loss_mlp": 0.14135742, + "step": 5144, + "time_per_iteration": 2.6792209148406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064822, + "balance_loss_mlp": 1.05051708, + "epoch": 0.9898037706810312, + "flos": 
567339614208.0, + "grad_norm": 0.0871849348343538, + "language_loss": 0.80570358, + "learning_rate": 2.726128618033008e-07, + "loss": 0.81635183, + "num_input_tokens_seen": 426191888, + "router_z_loss_mlp": 0.14306641, + "step": 5145, + "time_per_iteration": 2.656355142593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01007929, + "balance_loss_mlp": 1.0016824, + "epoch": 0.9899961523662947, + "flos": 1550268191232.0, + "grad_norm": 0.0032846387566116595, + "language_loss": 0.78146422, + "learning_rate": 2.624235252706164e-07, + "loss": 0.79154348, + "num_input_tokens_seen": 426425840, + "router_z_loss_mlp": 0.0625, + "step": 5146, + "time_per_iteration": 4.951949834823608 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065686, + "balance_loss_mlp": 1.05124998, + "epoch": 0.9901885340515583, + "flos": 610709787648.0, + "grad_norm": 0.06900468674588564, + "language_loss": 0.85261124, + "learning_rate": 2.524282040715642e-07, + "loss": 0.86326808, + "num_input_tokens_seen": 426506080, + "router_z_loss_mlp": 0.14428711, + "step": 5147, + "time_per_iteration": 2.8891849517822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061196, + "balance_loss_mlp": 1.04692626, + "epoch": 0.9903809157368219, + "flos": 517483385856.0, + "grad_norm": 0.06704626028114179, + "language_loss": 0.82689345, + "learning_rate": 2.426269020866512e-07, + "loss": 0.83750546, + "num_input_tokens_seen": 426573936, + "router_z_loss_mlp": 0.14257812, + "step": 5148, + "time_per_iteration": 2.569988965988159 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068071, + "balance_loss_mlp": 1.05389738, + "epoch": 0.9905732974220854, + "flos": 1100426757120.0, + "grad_norm": 0.06984824296340629, + "language_loss": 0.8062039, + "learning_rate": 2.3301962312122226e-07, + "loss": 0.81688464, + "num_input_tokens_seen": 426657472, + "router_z_loss_mlp": 0.1418457, + "step": 5149, + "time_per_iteration": 3.4324331283569336 + }, + { + 
"auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106702, + "balance_loss_mlp": 1.05286968, + "epoch": 0.990765679107349, + "flos": 858002056704.0, + "grad_norm": 0.08001176069613011, + "language_loss": 0.84435785, + "learning_rate": 2.2360637090496073e-07, + "loss": 0.85502803, + "num_input_tokens_seen": 426740560, + "router_z_loss_mlp": 0.14147949, + "step": 5150, + "time_per_iteration": 3.1345553398132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063045, + "balance_loss_mlp": 1.04887056, + "epoch": 0.9909580607926125, + "flos": 491287721472.0, + "grad_norm": 0.07724815370290013, + "language_loss": 0.80111492, + "learning_rate": 2.143871490925542e-07, + "loss": 0.81174541, + "num_input_tokens_seen": 426809296, + "router_z_loss_mlp": 0.14172363, + "step": 5151, + "time_per_iteration": 2.584542751312256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064273, + "balance_loss_mlp": 1.04981256, + "epoch": 0.9911504424778761, + "flos": 585060350976.0, + "grad_norm": 0.06559857995855996, + "language_loss": 0.79478276, + "learning_rate": 2.0536196126319519e-07, + "loss": 0.80542547, + "num_input_tokens_seen": 426881056, + "router_z_loss_mlp": 0.14440918, + "step": 5152, + "time_per_iteration": 2.7145586013793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062084, + "balance_loss_mlp": 1.04769564, + "epoch": 0.9913428241631397, + "flos": 570030359040.0, + "grad_norm": 0.07497960334620508, + "language_loss": 0.81697887, + "learning_rate": 1.9653081092074753e-07, + "loss": 0.8275997, + "num_input_tokens_seen": 426949664, + "router_z_loss_mlp": 0.14379883, + "step": 5153, + "time_per_iteration": 2.6985859870910645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066223, + "balance_loss_mlp": 1.05188227, + "epoch": 0.9915352058484033, + "flos": 489745958400.0, + "grad_norm": 0.0690511487953324, + "language_loss": 0.85916805, + "learning_rate": 1.8789370149374652e-07, + "loss": 
0.86983025, + "num_input_tokens_seen": 427018816, + "router_z_loss_mlp": 0.14343262, + "step": 5154, + "time_per_iteration": 2.670865058898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063538, + "balance_loss_mlp": 1.04945898, + "epoch": 0.9917275875336667, + "flos": 744047741952.0, + "grad_norm": 0.06727486345709939, + "language_loss": 0.82774746, + "learning_rate": 1.7945063633545423e-07, + "loss": 0.83838284, + "num_input_tokens_seen": 427097984, + "router_z_loss_mlp": 0.14086914, + "step": 5155, + "time_per_iteration": 3.022193670272827 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066316, + "balance_loss_mlp": 1.05217803, + "epoch": 0.9919199692189303, + "flos": 508272081408.0, + "grad_norm": 0.09168443128233669, + "language_loss": 0.80004323, + "learning_rate": 1.7120161872380412e-07, + "loss": 0.81070638, + "num_input_tokens_seen": 427169280, + "router_z_loss_mlp": 0.14135742, + "step": 5156, + "time_per_iteration": 2.74088716506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065009, + "balance_loss_mlp": 1.0509547, + "epoch": 0.9921123509041939, + "flos": 543963174912.0, + "grad_norm": 0.06903037281830961, + "language_loss": 0.83948219, + "learning_rate": 1.6314665186123457e-07, + "loss": 0.85013229, + "num_input_tokens_seen": 427237312, + "router_z_loss_mlp": 0.14074707, + "step": 5157, + "time_per_iteration": 2.6691086292266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063591, + "balance_loss_mlp": 1.04928589, + "epoch": 0.9923047325894575, + "flos": 671561362944.0, + "grad_norm": 0.07427130857167381, + "language_loss": 0.7724582, + "learning_rate": 1.5528573887507724e-07, + "loss": 0.78309411, + "num_input_tokens_seen": 427305008, + "router_z_loss_mlp": 0.14294434, + "step": 5158, + "time_per_iteration": 2.823317289352417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064823, + "balance_loss_mlp": 1.0502553, + "epoch": 0.9924971142747211, + 
"flos": 466557096960.0, + "grad_norm": 0.06658108495263643, + "language_loss": 0.80721772, + "learning_rate": 1.4761888281711322e-07, + "loss": 0.81786597, + "num_input_tokens_seen": 427377008, + "router_z_loss_mlp": 0.14562988, + "step": 5159, + "time_per_iteration": 2.6917340755462646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_mlp": 1.05381215, + "epoch": 0.9926894959599846, + "flos": 491581757952.0, + "grad_norm": 0.06964329318047109, + "language_loss": 0.82796186, + "learning_rate": 1.4014608666390594e-07, + "loss": 0.83864188, + "num_input_tokens_seen": 427444528, + "router_z_loss_mlp": 0.1418457, + "step": 5160, + "time_per_iteration": 2.599081039428711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072872, + "balance_loss_mlp": 1.05843568, + "epoch": 0.9928818776452482, + "flos": 492389715456.0, + "grad_norm": 0.06970897228596049, + "language_loss": 0.81670171, + "learning_rate": 1.328673533166902e-07, + "loss": 0.82743043, + "num_input_tokens_seen": 427509808, + "router_z_loss_mlp": 0.14416504, + "step": 5161, + "time_per_iteration": 2.6220340728759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066514, + "balance_loss_mlp": 1.05213773, + "epoch": 0.9930742593305117, + "flos": 546357312000.0, + "grad_norm": 0.07366206225814581, + "language_loss": 0.84272861, + "learning_rate": 1.2578268560131666e-07, + "loss": 0.85339379, + "num_input_tokens_seen": 427587936, + "router_z_loss_mlp": 0.14355469, + "step": 5162, + "time_per_iteration": 2.735678195953369 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065503, + "balance_loss_mlp": 1.05122125, + "epoch": 0.9932666410157753, + "flos": 585510031872.0, + "grad_norm": 0.06375322147647451, + "language_loss": 0.85993826, + "learning_rate": 1.1889208626825188e-07, + "loss": 0.87059331, + "num_input_tokens_seen": 427662224, + "router_z_loss_mlp": 0.1427002, + "step": 5163, + "time_per_iteration": 
2.778244972229004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066413, + "balance_loss_mlp": 1.05194068, + "epoch": 0.9934590227010388, + "flos": 537086909952.0, + "grad_norm": 0.06627215251949191, + "language_loss": 0.83730602, + "learning_rate": 1.1219555799268921e-07, + "loss": 0.84797013, + "num_input_tokens_seen": 427730544, + "router_z_loss_mlp": 0.14465332, + "step": 5164, + "time_per_iteration": 2.6427829265594482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064806, + "balance_loss_mlp": 1.05076265, + "epoch": 0.9936514043863024, + "flos": 518014559232.0, + "grad_norm": 0.07333210721360668, + "language_loss": 0.86763346, + "learning_rate": 1.0569310337443794e-07, + "loss": 0.87828159, + "num_input_tokens_seen": 427799760, + "router_z_loss_mlp": 0.14038086, + "step": 5165, + "time_per_iteration": 2.648057699203491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065731, + "balance_loss_mlp": 1.05143774, + "epoch": 0.993843786071566, + "flos": 744625903104.0, + "grad_norm": 0.09605012053498939, + "language_loss": 0.80456662, + "learning_rate": 9.938472493803419e-08, + "loss": 0.81522393, + "num_input_tokens_seen": 427881936, + "router_z_loss_mlp": 0.14294434, + "step": 5166, + "time_per_iteration": 3.0385072231292725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060117, + "balance_loss_mlp": 1.04587162, + "epoch": 0.9940361677568296, + "flos": 525918666240.0, + "grad_norm": 0.08892246655608081, + "language_loss": 0.82095218, + "learning_rate": 9.327042513251893e-08, + "loss": 0.83155328, + "num_input_tokens_seen": 427951648, + "router_z_loss_mlp": 0.14245605, + "step": 5167, + "time_per_iteration": 2.6904261112213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062083, + "balance_loss_mlp": 1.04747951, + "epoch": 0.9942285494420932, + "flos": 555650108928.0, + "grad_norm": 0.08487084317420483, + "language_loss": 0.79729229, + "learning_rate": 
8.735020633177104e-08, + "loss": 0.80791312, + "num_input_tokens_seen": 428031184, + "router_z_loss_mlp": 0.14599609, + "step": 5168, + "time_per_iteration": 2.8531885147094727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061677, + "balance_loss_mlp": 1.0473485, + "epoch": 0.9944209311273566, + "flos": 585996788736.0, + "grad_norm": 0.06702061083159072, + "language_loss": 0.82122445, + "learning_rate": 8.162407083411872e-08, + "loss": 0.83184129, + "num_input_tokens_seen": 428107296, + "router_z_loss_mlp": 0.14318848, + "step": 5169, + "time_per_iteration": 2.822988271713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106293, + "balance_loss_mlp": 1.04876781, + "epoch": 0.9946133128126202, + "flos": 735518486016.0, + "grad_norm": 0.06861911155023592, + "language_loss": 0.82474887, + "learning_rate": 7.609202086272804e-08, + "loss": 0.83537817, + "num_input_tokens_seen": 428187904, + "router_z_loss_mlp": 0.1418457, + "step": 5170, + "time_per_iteration": 3.060026168823242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069695, + "balance_loss_mlp": 1.0555805, + "epoch": 0.9948056944978838, + "flos": 646018011648.0, + "grad_norm": 0.0773612646357127, + "language_loss": 0.82002652, + "learning_rate": 7.075405856526995e-08, + "loss": 0.83072352, + "num_input_tokens_seen": 428255856, + "router_z_loss_mlp": 0.14111328, + "step": 5171, + "time_per_iteration": 2.75346040725708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064209, + "balance_loss_mlp": 1.05016601, + "epoch": 0.9949980761831474, + "flos": 445846809600.0, + "grad_norm": 0.07220922627510916, + "language_loss": 0.86264348, + "learning_rate": 6.561018601414226e-08, + "loss": 0.87328553, + "num_input_tokens_seen": 428321872, + "router_z_loss_mlp": 0.14050293, + "step": 5172, + "time_per_iteration": 2.525021553039551 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064486, + "balance_loss_mlp": 1.05007386, + 
"epoch": 0.995190457868411, + "flos": 435637398528.0, + "grad_norm": 0.07325225932553031, + "language_loss": 0.85702819, + "learning_rate": 6.066040520641414e-08, + "loss": 0.86767304, + "num_input_tokens_seen": 428389232, + "router_z_loss_mlp": 0.14416504, + "step": 5173, + "time_per_iteration": 2.564004421234131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060993, + "balance_loss_mlp": 1.04647326, + "epoch": 0.9953828395536745, + "flos": 514187315712.0, + "grad_norm": 0.08617003715305835, + "language_loss": 0.81493837, + "learning_rate": 5.590471806377062e-08, + "loss": 0.82554829, + "num_input_tokens_seen": 428456128, + "router_z_loss_mlp": 0.14526367, + "step": 5174, + "time_per_iteration": 2.6167569160461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069772, + "balance_loss_mlp": 1.05562162, + "epoch": 0.995575221238938, + "flos": 479847836160.0, + "grad_norm": 0.07342208107709478, + "language_loss": 0.81817365, + "learning_rate": 5.134312643245709e-08, + "loss": 0.82887137, + "num_input_tokens_seen": 428523504, + "router_z_loss_mlp": 0.14135742, + "step": 5175, + "time_per_iteration": 2.56459641456604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064284, + "balance_loss_mlp": 1.04964542, + "epoch": 0.9957676029242016, + "flos": 587785600512.0, + "grad_norm": 0.08029056667757119, + "language_loss": 0.76727438, + "learning_rate": 4.6975632083445793e-08, + "loss": 0.77791721, + "num_input_tokens_seen": 428596880, + "router_z_loss_mlp": 0.14611816, + "step": 5176, + "time_per_iteration": 2.7907845973968506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065785, + "balance_loss_mlp": 1.05072904, + "epoch": 0.9959599846094652, + "flos": 426465741312.0, + "grad_norm": 0.07919759530962187, + "language_loss": 0.79668772, + "learning_rate": 4.280223671243588e-08, + "loss": 0.80734563, + "num_input_tokens_seen": 428659472, + "router_z_loss_mlp": 0.15039062, + "step": 5177, + 
"time_per_iteration": 2.520141124725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060542, + "balance_loss_mlp": 1.04635572, + "epoch": 0.9961523662947287, + "flos": 611619061248.0, + "grad_norm": 0.0661716216299747, + "language_loss": 0.80615926, + "learning_rate": 3.8822941939575804e-08, + "loss": 0.81676465, + "num_input_tokens_seen": 428736704, + "router_z_loss_mlp": 0.14196777, + "step": 5178, + "time_per_iteration": 2.9379143714904785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062706, + "balance_loss_mlp": 1.04836535, + "epoch": 0.9963447479799923, + "flos": 550785111552.0, + "grad_norm": 0.07409996739278059, + "language_loss": 0.73854387, + "learning_rate": 3.5037749309851927e-08, + "loss": 0.74917096, + "num_input_tokens_seen": 428808560, + "router_z_loss_mlp": 0.14331055, + "step": 5179, + "time_per_iteration": 2.6890337467193604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_mlp": 1.05553138, + "epoch": 0.9965371296652559, + "flos": 625873402368.0, + "grad_norm": 0.08140162652865764, + "language_loss": 0.88694125, + "learning_rate": 3.1446660292755446e-08, + "loss": 0.89763814, + "num_input_tokens_seen": 428880688, + "router_z_loss_mlp": 0.14172363, + "step": 5180, + "time_per_iteration": 2.7570343017578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069873, + "balance_loss_mlp": 1.05535316, + "epoch": 0.9967295113505195, + "flos": 639522044928.0, + "grad_norm": 0.10737901389805089, + "language_loss": 0.81821299, + "learning_rate": 2.8049676282504433e-08, + "loss": 0.82891166, + "num_input_tokens_seen": 428960096, + "router_z_loss_mlp": 0.14526367, + "step": 5181, + "time_per_iteration": 2.881687641143799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069285, + "balance_loss_mlp": 1.05515885, + "epoch": 0.996921893035783, + "flos": 607389124608.0, + "grad_norm": 0.07807155766477335, + "language_loss": 0.7710281, + 
"learning_rate": 2.484679859793282e-08, + "loss": 0.78172094, + "num_input_tokens_seen": 429031296, + "router_z_loss_mlp": 0.14123535, + "step": 5182, + "time_per_iteration": 2.8261380195617676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064976, + "balance_loss_mlp": 1.05039656, + "epoch": 0.9971142747210465, + "flos": 644162388480.0, + "grad_norm": 0.07614598959451568, + "language_loss": 0.8217324, + "learning_rate": 2.183802848243488e-08, + "loss": 0.83238214, + "num_input_tokens_seen": 429103312, + "router_z_loss_mlp": 0.14550781, + "step": 5183, + "time_per_iteration": 2.8276331424713135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062789, + "balance_loss_mlp": 1.0486865, + "epoch": 0.9973066564063101, + "flos": 1040773722624.0, + "grad_norm": 0.08041083784391524, + "language_loss": 0.80840302, + "learning_rate": 1.9023367104187285e-08, + "loss": 0.81903088, + "num_input_tokens_seen": 429194896, + "router_z_loss_mlp": 0.14123535, + "step": 5184, + "time_per_iteration": 3.434018135070801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072435, + "balance_loss_mlp": 1.05810559, + "epoch": 0.9974990380915737, + "flos": 665095131648.0, + "grad_norm": 0.08089195273991168, + "language_loss": 0.83247042, + "learning_rate": 1.640281555587153e-08, + "loss": 0.84319472, + "num_input_tokens_seen": 429267664, + "router_z_loss_mlp": 0.14331055, + "step": 5185, + "time_per_iteration": 2.8602936267852783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061744, + "balance_loss_mlp": 1.04755795, + "epoch": 0.9976914197768373, + "flos": 718121521152.0, + "grad_norm": 0.07909774692148493, + "language_loss": 0.77502704, + "learning_rate": 1.3976374855007024e-08, + "loss": 0.78564447, + "num_input_tokens_seen": 429343472, + "router_z_loss_mlp": 0.1418457, + "step": 5186, + "time_per_iteration": 2.864870071411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106677, + 
"balance_loss_mlp": 1.05215502, + "epoch": 0.9978838014621008, + "flos": 518328419328.0, + "grad_norm": 0.07734126987679583, + "language_loss": 0.79241562, + "learning_rate": 1.1744045943451464e-08, + "loss": 0.8030833, + "num_input_tokens_seen": 429411472, + "router_z_loss_mlp": 0.14599609, + "step": 5187, + "time_per_iteration": 2.6048929691314697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064491, + "balance_loss_mlp": 1.05000699, + "epoch": 0.9980761831473643, + "flos": 603430829568.0, + "grad_norm": 0.07051998739206643, + "language_loss": 0.84329116, + "learning_rate": 9.70582968801148e-09, + "loss": 0.85393608, + "num_input_tokens_seen": 429486704, + "router_z_loss_mlp": 0.14465332, + "step": 5188, + "time_per_iteration": 2.8364462852478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065717, + "balance_loss_mlp": 1.05131662, + "epoch": 0.9982685648326279, + "flos": 453523691520.0, + "grad_norm": 0.07087888956754207, + "language_loss": 0.89041173, + "learning_rate": 7.861726879943021e-09, + "loss": 0.90106881, + "num_input_tokens_seen": 429554736, + "router_z_loss_mlp": 0.1439209, + "step": 5189, + "time_per_iteration": 2.5594921112060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067352, + "balance_loss_mlp": 1.05292726, + "epoch": 0.9984609465178915, + "flos": 481424103936.0, + "grad_norm": 0.09374409580915176, + "language_loss": 0.78683227, + "learning_rate": 6.211738235173403e-09, + "loss": 0.7975058, + "num_input_tokens_seen": 429623216, + "router_z_loss_mlp": 0.14416504, + "step": 5190, + "time_per_iteration": 2.660878896713257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106628, + "balance_loss_mlp": 1.05229664, + "epoch": 0.9986533282031551, + "flos": 476941976064.0, + "grad_norm": 0.06560028389559337, + "language_loss": 0.84236324, + "learning_rate": 4.755864394301312e-09, + "loss": 0.85302609, + "num_input_tokens_seen": 429695808, + "router_z_loss_mlp": 
0.13989258, + "step": 5191, + "time_per_iteration": 2.663154363632202 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069579, + "balance_loss_mlp": 1.05502343, + "epoch": 0.9988457098884186, + "flos": 641948488704.0, + "grad_norm": 0.06484380916605655, + "language_loss": 0.8642782, + "learning_rate": 3.494105922541291e-09, + "loss": 0.87497401, + "num_input_tokens_seen": 429774464, + "router_z_loss_mlp": 0.14526367, + "step": 5192, + "time_per_iteration": 2.8266706466674805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01065596, + "balance_loss_mlp": 1.05106461, + "epoch": 0.9990380915736822, + "flos": 396321693696.0, + "grad_norm": 0.1303267741249794, + "language_loss": 0.87754923, + "learning_rate": 2.4264633097237365e-09, + "loss": 0.88820517, + "num_input_tokens_seen": 429835872, + "router_z_loss_mlp": 0.14538574, + "step": 5193, + "time_per_iteration": 2.4542324542999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01063923, + "balance_loss_mlp": 1.04993951, + "epoch": 0.9992304732589458, + "flos": 576123259392.0, + "grad_norm": 0.07524852262078693, + "language_loss": 0.84832311, + "learning_rate": 1.552936970405927e-09, + "loss": 0.8589623, + "num_input_tokens_seen": 429911440, + "router_z_loss_mlp": 0.13989258, + "step": 5194, + "time_per_iteration": 2.7570321559906006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069294, + "balance_loss_mlp": 1.05532289, + "epoch": 0.9994228549442093, + "flos": 544291716096.0, + "grad_norm": 0.09657930255398448, + "language_loss": 0.75726849, + "learning_rate": 8.735272437054853e-10, + "loss": 0.76796138, + "num_input_tokens_seen": 429982512, + "router_z_loss_mlp": 0.13964844, + "step": 5195, + "time_per_iteration": 2.6761112213134766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068246, + "balance_loss_mlp": 1.05446517, + "epoch": 0.9996152366294728, + "flos": 1471314502656.0, + "grad_norm": 0.07319261176496342, + 
"language_loss": 0.80473548, + "learning_rate": 3.882343933003796e-10, + "loss": 0.815418, + "num_input_tokens_seen": 430070944, + "router_z_loss_mlp": 0.13787842, + "step": 5196, + "time_per_iteration": 3.7586510181427 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048015, + "balance_loss_mlp": 1.03534341, + "epoch": 0.9998076183147364, + "flos": 618950149632.0, + "grad_norm": 0.11328018299844213, + "language_loss": 0.70060062, + "learning_rate": 9.70586077619906e-11, + "loss": 0.71108079, + "num_input_tokens_seen": 430164864, + "router_z_loss_mlp": 0.12652588, + "step": 5197, + "time_per_iteration": 4.048620700836182 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01028912, + "balance_loss_mlp": 1.01973271, + "epoch": 1.0, + "flos": 1290737617920.0, + "grad_norm": 0.0323024732407461, + "language_loss": 0.84126532, + "learning_rate": 0.0, + "loss": 0.85155439, + "num_input_tokens_seen": 430340944, + "router_z_loss_mlp": 0.09185791, + "step": 5198, + "time_per_iteration": 5.656566858291626 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 430340944, + "step": 5198, + "total_flos": 1.171926856433664e+16, + "train_loss": 0.0, + "train_runtime": 8.6776, + "train_samples_per_second": 76668.108, + "train_steps_per_second": 599.011 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 430340944, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.171926856433664e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +} diff --git a/sft_pretrain/Full_smoe/training_args.bin b/sft_pretrain/Full_smoe/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..40700557efde4cd92c683630d55da293a6207d35 --- /dev/null +++ 
b/sft_pretrain/Full_smoe/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a606e327b153b50186d15c8ba216406d73d4c21c02aedd6a085b046e27455fd6 +size 7992